sock.c revision 5dbe7c178d3f0a4634f088d9e729f1909b9ddcd1
1/*
2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
3 *		operating system.  INET is implemented using the  BSD Socket
4 *		interface as the means of communication with the user level.
5 *
6 *		Generic socket support routines. Memory allocators, socket lock/release
7 *		handler for protocols to use and generic option handler.
8 *
9 *
10 * Authors:	Ross Biro
11 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 *		Florian La Roche, <flla@stud.uni-sb.de>
13 *		Alan Cox, <A.Cox@swansea.ac.uk>
14 *
15 * Fixes:
16 *		Alan Cox	: 	Numerous verify_area() problems
17 *		Alan Cox	:	Connecting on a connecting socket
18 *					now returns an error for tcp.
19 *		Alan Cox	:	sock->protocol is set correctly.
20 *					and is not sometimes left as 0.
21 *		Alan Cox	:	connect handles icmp errors on a
22 *					connect properly. Unfortunately there
23 *					is a restart syscall nasty there. I
24 *					can't match BSD without hacking the C
25 *					library. Ideas urgently sought!
26 *		Alan Cox	:	Disallow bind() to addresses that are
27 *					not ours - especially broadcast ones!!
28 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30 *					instead they leave that for the DESTROY timer.
31 *		Alan Cox	:	Clean up error flag in accept
32 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33 *					was buggy. Put a remove_sock() in the handler
34 *					for memory when we hit 0. Also altered the timer
35 *					code. The ACK stuff can wait and needs major
36 *					TCP layer surgery.
37 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38 *					and fixed timer/inet_bh race.
39 *		Alan Cox	:	Added zapped flag for TCP
40 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47 *	Pauline Middelink	:	identd support
48 *		Alan Cox	:	Fixed connect() taking signals I think.
49 *		Alan Cox	:	SO_LINGER supported
50 *		Alan Cox	:	Error reporting fixes
51 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52 *		Alan Cox	:	inet sockets don't set sk->type!
53 *		Alan Cox	:	Split socket option code
54 *		Alan Cox	:	Callbacks
55 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56 *		Alex		:	Removed restriction on inet fioctl
57 *		Alan Cox	:	Splitting INET from NET core
58 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60 *		Alan Cox	:	Split IP from generic code
61 *		Alan Cox	:	New kfree_skbmem()
62 *		Alan Cox	:	Make SO_DEBUG superuser only.
63 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64 *					(compatibility fix)
65 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66 *		Alan Cox	:	Allocator for a socket is settable.
67 *		Alan Cox	:	SO_ERROR includes soft errors.
68 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69 *		Alan Cox	: 	Generic socket allocation to make hooks
70 *					easier (suggested by Craig Metz).
71 *		Michael Pall	:	SO_ERROR returns positive errno again
72 *              Steve Whitehouse:       Added default destructor to free
73 *                                      protocol private data.
74 *              Steve Whitehouse:       Added various other default routines
75 *                                      common to several socket families.
76 *              Chris Evans     :       Call suser() check last on F_SETOWN
77 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79 *		Andi Kleen	:	Fix write_space callback
80 *		Chris Evans	:	Security fixes - signedness again
81 *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82 *
83 * To Fix:
84 *
85 *
86 *		This program is free software; you can redistribute it and/or
87 *		modify it under the terms of the GNU General Public License
88 *		as published by the Free Software Foundation; either version
89 *		2 of the License, or (at your option) any later version.
90 */
91
92#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
93
94#include <linux/capability.h>
95#include <linux/errno.h>
96#include <linux/types.h>
97#include <linux/socket.h>
98#include <linux/in.h>
99#include <linux/kernel.h>
100#include <linux/module.h>
101#include <linux/proc_fs.h>
102#include <linux/seq_file.h>
103#include <linux/sched.h>
104#include <linux/timer.h>
105#include <linux/string.h>
106#include <linux/sockios.h>
107#include <linux/net.h>
108#include <linux/mm.h>
109#include <linux/slab.h>
110#include <linux/interrupt.h>
111#include <linux/poll.h>
112#include <linux/tcp.h>
113#include <linux/init.h>
114#include <linux/highmem.h>
115#include <linux/user_namespace.h>
116#include <linux/static_key.h>
117#include <linux/memcontrol.h>
118#include <linux/prefetch.h>
119
120#include <asm/uaccess.h>
121
122#include <linux/netdevice.h>
123#include <net/protocol.h>
124#include <linux/skbuff.h>
125#include <net/net_namespace.h>
126#include <net/request_sock.h>
127#include <net/sock.h>
128#include <linux/net_tstamp.h>
129#include <net/xfrm.h>
130#include <linux/ipsec.h>
131#include <net/cls_cgroup.h>
132#include <net/netprio_cgroup.h>
133
134#include <linux/filter.h>
135
136#include <trace/events/sock.h>
137
138#ifdef CONFIG_INET
139#include <net/tcp.h>
140#endif
141
142static DEFINE_MUTEX(proto_list_mutex);
143static LIST_HEAD(proto_list);
144
145#ifdef CONFIG_MEMCG_KMEM
146int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
147{
148	struct proto *proto;
149	int ret = 0;
150
151	mutex_lock(&proto_list_mutex);
152	list_for_each_entry(proto, &proto_list, node) {
153		if (proto->init_cgroup) {
154			ret = proto->init_cgroup(memcg, ss);
155			if (ret)
156				goto out;
157		}
158	}
159
160	mutex_unlock(&proto_list_mutex);
161	return ret;
162out:
163	list_for_each_entry_continue_reverse(proto, &proto_list, node)
164		if (proto->destroy_cgroup)
165			proto->destroy_cgroup(memcg);
166	mutex_unlock(&proto_list_mutex);
167	return ret;
168}
169
170void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg)
171{
172	struct proto *proto;
173
174	mutex_lock(&proto_list_mutex);
175	list_for_each_entry_reverse(proto, &proto_list, node)
176		if (proto->destroy_cgroup)
177			proto->destroy_cgroup(memcg);
178	mutex_unlock(&proto_list_mutex);
179}
180#endif
181
182/*
183 * Each address family might have different locking rules, so we have
184 * one slock key per address family:
185 */
186static struct lock_class_key af_family_keys[AF_MAX];
187static struct lock_class_key af_family_slock_keys[AF_MAX];
188
189#if defined(CONFIG_MEMCG_KMEM)
190struct static_key memcg_socket_limit_enabled;
191EXPORT_SYMBOL(memcg_socket_limit_enabled);
192#endif
193
194/*
195 * Make lock validator output more readable. (we pre-construct these
196 * strings build-time, so that runtime initialization of socket
197 * locks is fast):
198 */
199static const char *const af_family_key_strings[AF_MAX+1] = {
200  "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
201  "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
202  "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
203  "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
204  "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
205  "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
206  "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
207  "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
208  "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
209  "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
210  "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-AF_IUCV"     ,
211  "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
212  "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
213  "sk_lock-AF_NFC"   , "sk_lock-AF_VSOCK"    , "sk_lock-AF_MAX"
214};
215static const char *const af_family_slock_key_strings[AF_MAX+1] = {
216  "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
217  "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
218  "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
219  "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
220  "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
221  "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
222  "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
223  "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
224  "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
225  "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
226  "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
227  "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
228  "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
229  "slock-AF_NFC"   , "slock-AF_VSOCK"    , "slock-AF_MAX"
230};
231static const char *const af_family_clock_key_strings[AF_MAX+1] = {
232  "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
233  "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
234  "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
235  "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
236  "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
237  "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
238  "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
239  "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
240  "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
241  "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
242  "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
243  "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
244  "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
245  "clock-AF_NFC"   , "clock-AF_VSOCK"    , "clock-AF_MAX"
246};
247
248/*
249 * sk_callback_lock locking rules are per-address-family,
250 * so split the lock classes by using a per-AF key:
251 */
252static struct lock_class_key af_callback_keys[AF_MAX];
253
254/* Take into consideration the size of the struct sk_buff overhead in the
255 * determination of these values, since that is non-constant across
256 * platforms.  This makes socket queueing behavior and performance
257 * not depend upon such differences.
258 */
259#define _SK_MEM_PACKETS		256
260#define _SK_MEM_OVERHEAD	SKB_TRUESIZE(256)
261#define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
262#define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
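/*
 * Illustrative arithmetic: SKB_TRUESIZE(256) is 256 bytes of payload plus the
 * aligned sizes of struct sk_buff and struct skb_shared_info, so the exact
 * overhead is platform dependent. Assuming roughly 600 bytes of metadata on a
 * 64-bit build:
 *
 *	_SK_MEM_OVERHEAD ~= 256 + 600 bytes per packet
 *	SK_WMEM_MAX      ~= 856 * 256 packets ~= 214 KiB
 *
 * i.e. the default buffers are sized to hold about 256 small packets no
 * matter how large struct sk_buff is on a given architecture.
 */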
263
264/* Run time adjustable parameters. */
265__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
266EXPORT_SYMBOL(sysctl_wmem_max);
267__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
268EXPORT_SYMBOL(sysctl_rmem_max);
269__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
270__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
271
272/* Maximal space eaten by iovec or ancillary data plus some space */
273int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
274EXPORT_SYMBOL(sysctl_optmem_max);
275
276struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
277EXPORT_SYMBOL_GPL(memalloc_socks);
278
279/**
280 * sk_set_memalloc - sets %SOCK_MEMALLOC
281 * @sk: socket to set it on
282 *
283 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
284 * It's the responsibility of the admin to adjust min_free_kbytes
285 * to meet the requirements
286 */
287void sk_set_memalloc(struct sock *sk)
288{
289	sock_set_flag(sk, SOCK_MEMALLOC);
290	sk->sk_allocation |= __GFP_MEMALLOC;
291	static_key_slow_inc(&memalloc_socks);
292}
293EXPORT_SYMBOL_GPL(sk_set_memalloc);
294
295void sk_clear_memalloc(struct sock *sk)
296{
297	sock_reset_flag(sk, SOCK_MEMALLOC);
298	sk->sk_allocation &= ~__GFP_MEMALLOC;
299	static_key_slow_dec(&memalloc_socks);
300
301	/*
302	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
303	 * progress of swapping. However, if SOCK_MEMALLOC is cleared while
304	 * it has rmem allocations there is a risk that the user of the
305	 * socket cannot make forward progress due to exceeding the rmem
306	 * limits. By rights, sk_clear_memalloc() should only be called
307	 * on sockets being torn down but warn and reset the accounting if
308	 * that assumption breaks.
309	 */
310	if (WARN_ON(sk->sk_forward_alloc))
311		sk_mem_reclaim(sk);
312}
313EXPORT_SYMBOL_GPL(sk_clear_memalloc);
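/*
 * Typical (hypothetical) usage: a transport on the memory-reclaim path, such
 * as a swap-over-network socket, calls sk_set_memalloc() right after creating
 * its socket so allocations may dip into the PFMEMALLOC reserves, and calls
 * sk_clear_memalloc() just before tearing the socket down, which matches the
 * warning above about leftover sk_forward_alloc.
 */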
314
315int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
316{
317	int ret;
318	unsigned long pflags = current->flags;
319
320	/* these should have been dropped before queueing */
321	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
322
323	current->flags |= PF_MEMALLOC;
324	ret = sk->sk_backlog_rcv(sk, skb);
325	tsk_restore_flags(current, pflags, PF_MEMALLOC);
326
327	return ret;
328}
329EXPORT_SYMBOL(__sk_backlog_rcv);
330
331static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
332{
333	struct timeval tv;
334
335	if (optlen < sizeof(tv))
336		return -EINVAL;
337	if (copy_from_user(&tv, optval, sizeof(tv)))
338		return -EFAULT;
339	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
340		return -EDOM;
341
342	if (tv.tv_sec < 0) {
343		static int warned __read_mostly;
344
345		*timeo_p = 0;
346		if (warned < 10 && net_ratelimit()) {
347			warned++;
348			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
349				__func__, current->comm, task_pid_nr(current));
350		}
351		return 0;
352	}
353	*timeo_p = MAX_SCHEDULE_TIMEOUT;
354	if (tv.tv_sec == 0 && tv.tv_usec == 0)
355		return 0;
356	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
357		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
358	return 0;
359}
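/*
 * Worked example for the conversion above, assuming HZ == 1000 (so each tick
 * is 1000000/HZ == 1000 usec):
 *
 *	tv = { .tv_sec = 1, .tv_usec = 500000 }
 *	*timeo_p = 1 * 1000 + (500000 + 999) / 1000 = 1500 jiffies
 *
 * The rounding term ensures a non-zero usec remainder always costs at least
 * one extra tick, so a requested timeout is never shortened.
 */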
360
361static void sock_warn_obsolete_bsdism(const char *name)
362{
363	static int warned;
364	static char warncomm[TASK_COMM_LEN];
365	if (strcmp(warncomm, current->comm) && warned < 5) {
366		strcpy(warncomm,  current->comm);
367		pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
368			warncomm, name);
369		warned++;
370	}
371}
372
373#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
374
375static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
376{
377	if (sk->sk_flags & flags) {
378		sk->sk_flags &= ~flags;
379		if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP))
380			net_disable_timestamp();
381	}
382}
383
384
385int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
386{
387	int err;
388	int skb_len;
389	unsigned long flags;
390	struct sk_buff_head *list = &sk->sk_receive_queue;
391
392	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
393		atomic_inc(&sk->sk_drops);
394		trace_sock_rcvqueue_full(sk, skb);
395		return -ENOMEM;
396	}
397
398	err = sk_filter(sk, skb);
399	if (err)
400		return err;
401
402	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
403		atomic_inc(&sk->sk_drops);
404		return -ENOBUFS;
405	}
406
407	skb->dev = NULL;
408	skb_set_owner_r(skb, sk);
409
410	/* Cache the SKB length before we tack it onto the receive
411	 * queue.  Once it is added it no longer belongs to us and
412	 * may be freed by other threads of control pulling packets
413	 * from the queue.
414	 */
415	skb_len = skb->len;
416
417	/* we escape from the RCU-protected region, make sure we don't leak
418	 * a non-refcounted dst
419	 */
420	skb_dst_force(skb);
421
422	spin_lock_irqsave(&list->lock, flags);
423	skb->dropcount = atomic_read(&sk->sk_drops);
424	__skb_queue_tail(list, skb);
425	spin_unlock_irqrestore(&list->lock, flags);
426
427	if (!sock_flag(sk, SOCK_DEAD))
428		sk->sk_data_ready(sk, skb_len);
429	return 0;
430}
431EXPORT_SYMBOL(sock_queue_rcv_skb);
432
433int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
434{
435	int rc = NET_RX_SUCCESS;
436
437	if (sk_filter(sk, skb))
438		goto discard_and_relse;
439
440	skb->dev = NULL;
441
442	if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) {
443		atomic_inc(&sk->sk_drops);
444		goto discard_and_relse;
445	}
446	if (nested)
447		bh_lock_sock_nested(sk);
448	else
449		bh_lock_sock(sk);
450	if (!sock_owned_by_user(sk)) {
451		/*
452		 * trylock + unlock semantics:
453		 */
454		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
455
456		rc = sk_backlog_rcv(sk, skb);
457
458		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
459	} else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
460		bh_unlock_sock(sk);
461		atomic_inc(&sk->sk_drops);
462		goto discard_and_relse;
463	}
464
465	bh_unlock_sock(sk);
466out:
467	sock_put(sk);
468	return rc;
469discard_and_relse:
470	kfree_skb(skb);
471	goto out;
472}
473EXPORT_SYMBOL(sk_receive_skb);
474
475void sk_reset_txq(struct sock *sk)
476{
477	sk_tx_queue_clear(sk);
478}
479EXPORT_SYMBOL(sk_reset_txq);
480
481struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
482{
483	struct dst_entry *dst = __sk_dst_get(sk);
484
485	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
486		sk_tx_queue_clear(sk);
487		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
488		dst_release(dst);
489		return NULL;
490	}
491
492	return dst;
493}
494EXPORT_SYMBOL(__sk_dst_check);
495
496struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
497{
498	struct dst_entry *dst = sk_dst_get(sk);
499
500	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
501		sk_dst_reset(sk);
502		dst_release(dst);
503		return NULL;
504	}
505
506	return dst;
507}
508EXPORT_SYMBOL(sk_dst_check);
509
510static int sock_setbindtodevice(struct sock *sk, char __user *optval,
511				int optlen)
512{
513	int ret = -ENOPROTOOPT;
514#ifdef CONFIG_NETDEVICES
515	struct net *net = sock_net(sk);
516	char devname[IFNAMSIZ];
517	int index;
518
519	/* Sorry... */
520	ret = -EPERM;
521	if (!ns_capable(net->user_ns, CAP_NET_RAW))
522		goto out;
523
524	ret = -EINVAL;
525	if (optlen < 0)
526		goto out;
527
528	/* Bind this socket to a particular device like "eth0",
529	 * as specified in the passed interface name. If the
530	 * name is "" or the option length is zero the socket
531	 * is not bound.
532	 */
533	if (optlen > IFNAMSIZ - 1)
534		optlen = IFNAMSIZ - 1;
535	memset(devname, 0, sizeof(devname));
536
537	ret = -EFAULT;
538	if (copy_from_user(devname, optval, optlen))
539		goto out;
540
541	index = 0;
542	if (devname[0] != '\0') {
543		struct net_device *dev;
544
545		rcu_read_lock();
546		dev = dev_get_by_name_rcu(net, devname);
547		if (dev)
548			index = dev->ifindex;
549		rcu_read_unlock();
550		ret = -ENODEV;
551		if (!dev)
552			goto out;
553	}
554
555	lock_sock(sk);
556	sk->sk_bound_dev_if = index;
557	sk_dst_reset(sk);
558	release_sock(sk);
559
560	ret = 0;
561
562out:
563#endif
564
565	return ret;
566}
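/*
 * Minimal user-space sketch for the option above ("fd" is a hypothetical
 * socket descriptor; the caller needs CAP_NET_RAW):
 *
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "eth0", sizeof("eth0"));
 *
 * Passing an empty name (or a zero option length) removes the binding again,
 * as noted in the comment inside sock_setbindtodevice().
 */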
567
568static int sock_getbindtodevice(struct sock *sk, char __user *optval,
569				int __user *optlen, int len)
570{
571	int ret = -ENOPROTOOPT;
572#ifdef CONFIG_NETDEVICES
573	struct net *net = sock_net(sk);
574	char devname[IFNAMSIZ];
575
576	if (sk->sk_bound_dev_if == 0) {
577		len = 0;
578		goto zero;
579	}
580
581	ret = -EINVAL;
582	if (len < IFNAMSIZ)
583		goto out;
584
585	ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
586	if (ret)
587		goto out;
588
589	len = strlen(devname) + 1;
590
591	ret = -EFAULT;
592	if (copy_to_user(optval, devname, len))
593		goto out;
594
595zero:
596	ret = -EFAULT;
597	if (put_user(len, optlen))
598		goto out;
599
600	ret = 0;
601
602out:
603#endif
604
605	return ret;
606}
607
608static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
609{
610	if (valbool)
611		sock_set_flag(sk, bit);
612	else
613		sock_reset_flag(sk, bit);
614}
615
616/*
617 *	This is meant for all protocols to use and covers goings on
618 *	at the socket level. Everything here is generic.
619 */
620
621int sock_setsockopt(struct socket *sock, int level, int optname,
622		    char __user *optval, unsigned int optlen)
623{
624	struct sock *sk = sock->sk;
625	int val;
626	int valbool;
627	struct linger ling;
628	int ret = 0;
629
630	/*
631	 *	Options without arguments
632	 */
633
634	if (optname == SO_BINDTODEVICE)
635		return sock_setbindtodevice(sk, optval, optlen);
636
637	if (optlen < sizeof(int))
638		return -EINVAL;
639
640	if (get_user(val, (int __user *)optval))
641		return -EFAULT;
642
643	valbool = val ? 1 : 0;
644
645	lock_sock(sk);
646
647	switch (optname) {
648	case SO_DEBUG:
649		if (val && !capable(CAP_NET_ADMIN))
650			ret = -EACCES;
651		else
652			sock_valbool_flag(sk, SOCK_DBG, valbool);
653		break;
654	case SO_REUSEADDR:
655		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
656		break;
657	case SO_REUSEPORT:
658		sk->sk_reuseport = valbool;
659		break;
660	case SO_TYPE:
661	case SO_PROTOCOL:
662	case SO_DOMAIN:
663	case SO_ERROR:
664		ret = -ENOPROTOOPT;
665		break;
666	case SO_DONTROUTE:
667		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
668		break;
669	case SO_BROADCAST:
670		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
671		break;
672	case SO_SNDBUF:
673		/* Don't error on this; BSD doesn't, and if you think
674		 * about it, this is right. Otherwise apps have to
675		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
676		 * are treated in BSD as hints.
677		 */
678		val = min_t(u32, val, sysctl_wmem_max);
679set_sndbuf:
680		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
681		sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF);
682		/* Wake up sending tasks if we upped the value. */
683		sk->sk_write_space(sk);
684		break;
685
686	case SO_SNDBUFFORCE:
687		if (!capable(CAP_NET_ADMIN)) {
688			ret = -EPERM;
689			break;
690		}
691		goto set_sndbuf;
692
693	case SO_RCVBUF:
694		/* Don't error on this; BSD doesn't, and if you think
695		 * about it, this is right. Otherwise apps have to
696		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
697		 * are treated in BSD as hints.
698		 */
699		val = min_t(u32, val, sysctl_rmem_max);
700set_rcvbuf:
701		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
702		/*
703		 * We double it on the way in to account for
704		 * "struct sk_buff" etc. overhead.   Applications
705		 * assume that the SO_RCVBUF setting they make will
706		 * allow that much actual data to be received on that
707		 * socket.
708		 *
709		 * Applications are unaware that "struct sk_buff" and
710		 * other overheads allocate from the receive buffer
711		 * during socket buffer allocation.
712		 *
713		 * And after considering the possible alternatives,
714		 * returning the value we actually used in getsockopt
715		 * is the most desirable behavior.
716		 */
717		sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF);
718		break;
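	/*
	 * Worked example of the doubling above, assuming sysctl_rmem_max does
	 * not clamp the request: setsockopt(SO_RCVBUF) with val = 65536 stores
	 * sk_rcvbuf = 131072, and a later getsockopt(SO_RCVBUF) reports the
	 * doubled 131072 rather than the requested value.
	 */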
719
720	case SO_RCVBUFFORCE:
721		if (!capable(CAP_NET_ADMIN)) {
722			ret = -EPERM;
723			break;
724		}
725		goto set_rcvbuf;
726
727	case SO_KEEPALIVE:
728#ifdef CONFIG_INET
729		if (sk->sk_protocol == IPPROTO_TCP &&
730		    sk->sk_type == SOCK_STREAM)
731			tcp_set_keepalive(sk, valbool);
732#endif
733		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
734		break;
735
736	case SO_OOBINLINE:
737		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
738		break;
739
740	case SO_NO_CHECK:
741		sk->sk_no_check = valbool;
742		break;
743
744	case SO_PRIORITY:
745		if ((val >= 0 && val <= 6) ||
746		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
747			sk->sk_priority = val;
748		else
749			ret = -EPERM;
750		break;
751
752	case SO_LINGER:
753		if (optlen < sizeof(ling)) {
754			ret = -EINVAL;	/* 1003.1g */
755			break;
756		}
757		if (copy_from_user(&ling, optval, sizeof(ling))) {
758			ret = -EFAULT;
759			break;
760		}
761		if (!ling.l_onoff)
762			sock_reset_flag(sk, SOCK_LINGER);
763		else {
764#if (BITS_PER_LONG == 32)
765			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
766				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
767			else
768#endif
769				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
770			sock_set_flag(sk, SOCK_LINGER);
771		}
772		break;
773
774	case SO_BSDCOMPAT:
775		sock_warn_obsolete_bsdism("setsockopt");
776		break;
777
778	case SO_PASSCRED:
779		if (valbool)
780			set_bit(SOCK_PASSCRED, &sock->flags);
781		else
782			clear_bit(SOCK_PASSCRED, &sock->flags);
783		break;
784
785	case SO_TIMESTAMP:
786	case SO_TIMESTAMPNS:
787		if (valbool)  {
788			if (optname == SO_TIMESTAMP)
789				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
790			else
791				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
792			sock_set_flag(sk, SOCK_RCVTSTAMP);
793			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
794		} else {
795			sock_reset_flag(sk, SOCK_RCVTSTAMP);
796			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
797		}
798		break;
799
800	case SO_TIMESTAMPING:
801		if (val & ~SOF_TIMESTAMPING_MASK) {
802			ret = -EINVAL;
803			break;
804		}
805		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
806				  val & SOF_TIMESTAMPING_TX_HARDWARE);
807		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
808				  val & SOF_TIMESTAMPING_TX_SOFTWARE);
809		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
810				  val & SOF_TIMESTAMPING_RX_HARDWARE);
811		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
812			sock_enable_timestamp(sk,
813					      SOCK_TIMESTAMPING_RX_SOFTWARE);
814		else
815			sock_disable_timestamp(sk,
816					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
817		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
818				  val & SOF_TIMESTAMPING_SOFTWARE);
819		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
820				  val & SOF_TIMESTAMPING_SYS_HARDWARE);
821		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
822				  val & SOF_TIMESTAMPING_RAW_HARDWARE);
823		break;
824
825	case SO_RCVLOWAT:
826		if (val < 0)
827			val = INT_MAX;
828		sk->sk_rcvlowat = val ? : 1;
829		break;
830
831	case SO_RCVTIMEO:
832		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
833		break;
834
835	case SO_SNDTIMEO:
836		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
837		break;
838
839	case SO_ATTACH_FILTER:
840		ret = -EINVAL;
841		if (optlen == sizeof(struct sock_fprog)) {
842			struct sock_fprog fprog;
843
844			ret = -EFAULT;
845			if (copy_from_user(&fprog, optval, sizeof(fprog)))
846				break;
847
848			ret = sk_attach_filter(&fprog, sk);
849		}
850		break;
851
852	case SO_DETACH_FILTER:
853		ret = sk_detach_filter(sk);
854		break;
855
856	case SO_LOCK_FILTER:
857		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
858			ret = -EPERM;
859		else
860			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
861		break;
862
863	case SO_PASSSEC:
864		if (valbool)
865			set_bit(SOCK_PASSSEC, &sock->flags);
866		else
867			clear_bit(SOCK_PASSSEC, &sock->flags);
868		break;
869	case SO_MARK:
870		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
871			ret = -EPERM;
872		else
873			sk->sk_mark = val;
874		break;
875
876		/* We implement SO_SNDLOWAT etc. as not
877		   settable (1003.1g 5.3) */
878	case SO_RXQ_OVFL:
879		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
880		break;
881
882	case SO_WIFI_STATUS:
883		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
884		break;
885
886	case SO_PEEK_OFF:
887		if (sock->ops->set_peek_off)
888			sock->ops->set_peek_off(sk, val);
889		else
890			ret = -EOPNOTSUPP;
891		break;
892
893	case SO_NOFCS:
894		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
895		break;
896
897	case SO_SELECT_ERR_QUEUE:
898		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
899		break;
900
901	default:
902		ret = -ENOPROTOOPT;
903		break;
904	}
905	release_sock(sk);
906	return ret;
907}
908EXPORT_SYMBOL(sock_setsockopt);
909
910
911void cred_to_ucred(struct pid *pid, const struct cred *cred,
912		   struct ucred *ucred)
913{
914	ucred->pid = pid_vnr(pid);
915	ucred->uid = ucred->gid = -1;
916	if (cred) {
917		struct user_namespace *current_ns = current_user_ns();
918
919		ucred->uid = from_kuid_munged(current_ns, cred->euid);
920		ucred->gid = from_kgid_munged(current_ns, cred->egid);
921	}
922}
923EXPORT_SYMBOL_GPL(cred_to_ucred);
924
925int sock_getsockopt(struct socket *sock, int level, int optname,
926		    char __user *optval, int __user *optlen)
927{
928	struct sock *sk = sock->sk;
929
930	union {
931		int val;
932		struct linger ling;
933		struct timeval tm;
934	} v;
935
936	int lv = sizeof(int);
937	int len;
938
939	if (get_user(len, optlen))
940		return -EFAULT;
941	if (len < 0)
942		return -EINVAL;
943
944	memset(&v, 0, sizeof(v));
945
946	switch (optname) {
947	case SO_DEBUG:
948		v.val = sock_flag(sk, SOCK_DBG);
949		break;
950
951	case SO_DONTROUTE:
952		v.val = sock_flag(sk, SOCK_LOCALROUTE);
953		break;
954
955	case SO_BROADCAST:
956		v.val = sock_flag(sk, SOCK_BROADCAST);
957		break;
958
959	case SO_SNDBUF:
960		v.val = sk->sk_sndbuf;
961		break;
962
963	case SO_RCVBUF:
964		v.val = sk->sk_rcvbuf;
965		break;
966
967	case SO_REUSEADDR:
968		v.val = sk->sk_reuse;
969		break;
970
971	case SO_REUSEPORT:
972		v.val = sk->sk_reuseport;
973		break;
974
975	case SO_KEEPALIVE:
976		v.val = sock_flag(sk, SOCK_KEEPOPEN);
977		break;
978
979	case SO_TYPE:
980		v.val = sk->sk_type;
981		break;
982
983	case SO_PROTOCOL:
984		v.val = sk->sk_protocol;
985		break;
986
987	case SO_DOMAIN:
988		v.val = sk->sk_family;
989		break;
990
991	case SO_ERROR:
992		v.val = -sock_error(sk);
993		if (v.val == 0)
994			v.val = xchg(&sk->sk_err_soft, 0);
995		break;
996
997	case SO_OOBINLINE:
998		v.val = sock_flag(sk, SOCK_URGINLINE);
999		break;
1000
1001	case SO_NO_CHECK:
1002		v.val = sk->sk_no_check;
1003		break;
1004
1005	case SO_PRIORITY:
1006		v.val = sk->sk_priority;
1007		break;
1008
1009	case SO_LINGER:
1010		lv		= sizeof(v.ling);
1011		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
1012		v.ling.l_linger	= sk->sk_lingertime / HZ;
1013		break;
1014
1015	case SO_BSDCOMPAT:
1016		sock_warn_obsolete_bsdism("getsockopt");
1017		break;
1018
1019	case SO_TIMESTAMP:
1020		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1021				!sock_flag(sk, SOCK_RCVTSTAMPNS);
1022		break;
1023
1024	case SO_TIMESTAMPNS:
1025		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1026		break;
1027
1028	case SO_TIMESTAMPING:
1029		v.val = 0;
1030		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
1031			v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
1032		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
1033			v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
1034		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
1035			v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
1036		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
1037			v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
1038		if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
1039			v.val |= SOF_TIMESTAMPING_SOFTWARE;
1040		if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
1041			v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
1042		if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
1043			v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
1044		break;
1045
1046	case SO_RCVTIMEO:
1047		lv = sizeof(struct timeval);
1048		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1049			v.tm.tv_sec = 0;
1050			v.tm.tv_usec = 0;
1051		} else {
1052			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1053			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
1054		}
1055		break;
1056
1057	case SO_SNDTIMEO:
1058		lv = sizeof(struct timeval);
1059		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1060			v.tm.tv_sec = 0;
1061			v.tm.tv_usec = 0;
1062		} else {
1063			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1064			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
1065		}
1066		break;
1067
1068	case SO_RCVLOWAT:
1069		v.val = sk->sk_rcvlowat;
1070		break;
1071
1072	case SO_SNDLOWAT:
1073		v.val = 1;
1074		break;
1075
1076	case SO_PASSCRED:
1077		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1078		break;
1079
1080	case SO_PEERCRED:
1081	{
1082		struct ucred peercred;
1083		if (len > sizeof(peercred))
1084			len = sizeof(peercred);
1085		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1086		if (copy_to_user(optval, &peercred, len))
1087			return -EFAULT;
1088		goto lenout;
1089	}
1090
1091	case SO_PEERNAME:
1092	{
1093		char address[128];
1094
1095		if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
1096			return -ENOTCONN;
1097		if (lv < len)
1098			return -EINVAL;
1099		if (copy_to_user(optval, address, len))
1100			return -EFAULT;
1101		goto lenout;
1102	}
1103
1104	/* Dubious BSD thing... Probably nobody even uses it, but
1105	 * the UNIX standard wants it for whatever reason... -DaveM
1106	 */
1107	case SO_ACCEPTCONN:
1108		v.val = sk->sk_state == TCP_LISTEN;
1109		break;
1110
1111	case SO_PASSSEC:
1112		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1113		break;
1114
1115	case SO_PEERSEC:
1116		return security_socket_getpeersec_stream(sock, optval, optlen, len);
1117
1118	case SO_MARK:
1119		v.val = sk->sk_mark;
1120		break;
1121
1122	case SO_RXQ_OVFL:
1123		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1124		break;
1125
1126	case SO_WIFI_STATUS:
1127		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1128		break;
1129
1130	case SO_PEEK_OFF:
1131		if (!sock->ops->set_peek_off)
1132			return -EOPNOTSUPP;
1133
1134		v.val = sk->sk_peek_off;
1135		break;
1136	case SO_NOFCS:
1137		v.val = sock_flag(sk, SOCK_NOFCS);
1138		break;
1139
1140	case SO_BINDTODEVICE:
1141		return sock_getbindtodevice(sk, optval, optlen, len);
1142
1143	case SO_GET_FILTER:
1144		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1145		if (len < 0)
1146			return len;
1147
1148		goto lenout;
1149
1150	case SO_LOCK_FILTER:
1151		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1152		break;
1153
1154	case SO_SELECT_ERR_QUEUE:
1155		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1156		break;
1157
1158	default:
1159		return -ENOPROTOOPT;
1160	}
1161
1162	if (len > lv)
1163		len = lv;
1164	if (copy_to_user(optval, &v, len))
1165		return -EFAULT;
1166lenout:
1167	if (put_user(len, optlen))
1168		return -EFAULT;
1169	return 0;
1170}
1171
1172/*
1173 * Initialize an sk_lock.
1174 *
1175 * (We also register the sk_lock with the lock validator.)
1176 */
1177static inline void sock_lock_init(struct sock *sk)
1178{
1179	sock_lock_init_class_and_name(sk,
1180			af_family_slock_key_strings[sk->sk_family],
1181			af_family_slock_keys + sk->sk_family,
1182			af_family_key_strings[sk->sk_family],
1183			af_family_keys + sk->sk_family);
1184}
1185
1186/*
1187 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1188 * even temporarily, because of RCU lookups. sk_node should also be left as is.
1189 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end.
1190 */
1191static void sock_copy(struct sock *nsk, const struct sock *osk)
1192{
1193#ifdef CONFIG_SECURITY_NETWORK
1194	void *sptr = nsk->sk_security;
1195#endif
1196	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1197
1198	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1199	       osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1200
1201#ifdef CONFIG_SECURITY_NETWORK
1202	nsk->sk_security = sptr;
1203	security_sk_clone(osk, nsk);
1204#endif
1205}
1206
1207void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
1208{
1209	unsigned long nulls1, nulls2;
1210
1211	nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
1212	nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
1213	if (nulls1 > nulls2)
1214		swap(nulls1, nulls2);
1215
1216	if (nulls1 != 0)
1217		memset((char *)sk, 0, nulls1);
1218	memset((char *)sk + nulls1 + sizeof(void *), 0,
1219	       nulls2 - nulls1 - sizeof(void *));
1220	memset((char *)sk + nulls2 + sizeof(void *), 0,
1221	       size - nulls2 - sizeof(void *));
1222}
1223EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
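/*
 * Sketch of the zeroing above for illustration (the "8" stands in for
 * sizeof(void *) on a 64-bit build and is only an assumption). With
 * nulls1 < nulls2 the three memsets amount to:
 *
 *	[0, nulls1)                 zeroed
 *	skc_node.next               preserved
 *	[nulls1 + 8, nulls2)        zeroed
 *	skc_portaddr_node.next      preserved
 *	[nulls2 + 8, size)          zeroed
 *
 * so concurrent RCU/nulls list walkers never see these next pointers wiped
 * out from under them.
 */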
1224
1225static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1226		int family)
1227{
1228	struct sock *sk;
1229	struct kmem_cache *slab;
1230
1231	slab = prot->slab;
1232	if (slab != NULL) {
1233		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1234		if (!sk)
1235			return sk;
1236		if (priority & __GFP_ZERO) {
1237			if (prot->clear_sk)
1238				prot->clear_sk(sk, prot->obj_size);
1239			else
1240				sk_prot_clear_nulls(sk, prot->obj_size);
1241		}
1242	} else
1243		sk = kmalloc(prot->obj_size, priority);
1244
1245	if (sk != NULL) {
1246		kmemcheck_annotate_bitfield(sk, flags);
1247
1248		if (security_sk_alloc(sk, family, priority))
1249			goto out_free;
1250
1251		if (!try_module_get(prot->owner))
1252			goto out_free_sec;
1253		sk_tx_queue_clear(sk);
1254	}
1255
1256	return sk;
1257
1258out_free_sec:
1259	security_sk_free(sk);
1260out_free:
1261	if (slab != NULL)
1262		kmem_cache_free(slab, sk);
1263	else
1264		kfree(sk);
1265	return NULL;
1266}
1267
1268static void sk_prot_free(struct proto *prot, struct sock *sk)
1269{
1270	struct kmem_cache *slab;
1271	struct module *owner;
1272
1273	owner = prot->owner;
1274	slab = prot->slab;
1275
1276	security_sk_free(sk);
1277	if (slab != NULL)
1278		kmem_cache_free(slab, sk);
1279	else
1280		kfree(sk);
1281	module_put(owner);
1282}
1283
1284#if IS_ENABLED(CONFIG_NET_CLS_CGROUP)
1285void sock_update_classid(struct sock *sk)
1286{
1287	u32 classid;
1288
1289	classid = task_cls_classid(current);
1290	if (classid != sk->sk_classid)
1291		sk->sk_classid = classid;
1292}
1293EXPORT_SYMBOL(sock_update_classid);
1294#endif
1295
1296#if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
1297void sock_update_netprioidx(struct sock *sk)
1298{
1299	if (in_interrupt())
1300		return;
1301
1302	sk->sk_cgrp_prioidx = task_netprioidx(current);
1303}
1304EXPORT_SYMBOL_GPL(sock_update_netprioidx);
1305#endif
1306
1307/**
1308 *	sk_alloc - All socket objects are allocated here
1309 *	@net: the applicable net namespace
1310 *	@family: protocol family
1311 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1312 *	@prot: struct proto associated with this new sock instance
1313 */
1314struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1315		      struct proto *prot)
1316{
1317	struct sock *sk;
1318
1319	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1320	if (sk) {
1321		sk->sk_family = family;
1322		/*
1323		 * See comment in struct sock definition to understand
1324		 * why we need sk_prot_creator -acme
1325		 */
1326		sk->sk_prot = sk->sk_prot_creator = prot;
1327		sock_lock_init(sk);
1328		sock_net_set(sk, get_net(net));
1329		atomic_set(&sk->sk_wmem_alloc, 1);
1330
1331		sock_update_classid(sk);
1332		sock_update_netprioidx(sk);
1333	}
1334
1335	return sk;
1336}
1337EXPORT_SYMBOL(sk_alloc);
1338
1339static void __sk_free(struct sock *sk)
1340{
1341	struct sk_filter *filter;
1342
1343	if (sk->sk_destruct)
1344		sk->sk_destruct(sk);
1345
1346	filter = rcu_dereference_check(sk->sk_filter,
1347				       atomic_read(&sk->sk_wmem_alloc) == 0);
1348	if (filter) {
1349		sk_filter_uncharge(sk, filter);
1350		RCU_INIT_POINTER(sk->sk_filter, NULL);
1351	}
1352
1353	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1354
1355	if (atomic_read(&sk->sk_omem_alloc))
1356		pr_debug("%s: optmem leakage (%d bytes) detected\n",
1357			 __func__, atomic_read(&sk->sk_omem_alloc));
1358
1359	if (sk->sk_peer_cred)
1360		put_cred(sk->sk_peer_cred);
1361	put_pid(sk->sk_peer_pid);
1362	put_net(sock_net(sk));
1363	sk_prot_free(sk->sk_prot_creator, sk);
1364}
1365
1366void sk_free(struct sock *sk)
1367{
1368	/*
1369	 * We subtract one from sk_wmem_alloc so we can tell whether
1370	 * some packets are still in some tx queue.
1371	 * If the result is not zero, sock_wfree() will call __sk_free(sk) later.
1372	 */
1373	if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1374		__sk_free(sk);
1375}
1376EXPORT_SYMBOL(sk_free);
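/*
 * Illustrative timeline of the sk_wmem_alloc trick (truesize values are made
 * up). sk_alloc() starts the counter at 1, so:
 *
 *	sk_alloc()             sk_wmem_alloc = 1
 *	skb_set_owner_w()      sk_wmem_alloc = 1 + truesize
 *	sk_free()              sk_wmem_alloc = truesize   (socket stays alive)
 *	last sock_wfree()      sk_wmem_alloc = 0          -> __sk_free()
 *
 * i.e. whichever of sk_free() or the final write-buffer destructor runs last
 * is the one that actually frees the socket.
 */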
1377
1378/*
1379 * The last sock_put should drop the reference to sk->sk_net. It has already
1380 * been dropped in sk_change_net. Taking a reference to the stopping namespace
1381 * is not an option.
1382 * Take a reference to the socket to remove it from the hash while it is still
1383 * _alive_, and after that destroy it in the context of init_net.
1384 */
1385void sk_release_kernel(struct sock *sk)
1386{
1387	if (sk == NULL || sk->sk_socket == NULL)
1388		return;
1389
1390	sock_hold(sk);
1391	sock_release(sk->sk_socket);
1392	release_net(sock_net(sk));
1393	sock_net_set(sk, get_net(&init_net));
1394	sock_put(sk);
1395}
1396EXPORT_SYMBOL(sk_release_kernel);
1397
1398static void sk_update_clone(const struct sock *sk, struct sock *newsk)
1399{
1400	if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
1401		sock_update_memcg(newsk);
1402}
1403
1404/**
1405 *	sk_clone_lock - clone a socket, and lock its clone
1406 *	@sk: the socket to clone
1407 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1408 *
1409 *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1410 */
1411struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1412{
1413	struct sock *newsk;
1414
1415	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1416	if (newsk != NULL) {
1417		struct sk_filter *filter;
1418
1419		sock_copy(newsk, sk);
1420
1421		/* SANITY */
1422		get_net(sock_net(newsk));
1423		sk_node_init(&newsk->sk_node);
1424		sock_lock_init(newsk);
1425		bh_lock_sock(newsk);
1426		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
1427		newsk->sk_backlog.len = 0;
1428
1429		atomic_set(&newsk->sk_rmem_alloc, 0);
1430		/*
1431		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1432		 */
1433		atomic_set(&newsk->sk_wmem_alloc, 1);
1434		atomic_set(&newsk->sk_omem_alloc, 0);
1435		skb_queue_head_init(&newsk->sk_receive_queue);
1436		skb_queue_head_init(&newsk->sk_write_queue);
1437#ifdef CONFIG_NET_DMA
1438		skb_queue_head_init(&newsk->sk_async_wait_queue);
1439#endif
1440
1441		spin_lock_init(&newsk->sk_dst_lock);
1442		rwlock_init(&newsk->sk_callback_lock);
1443		lockdep_set_class_and_name(&newsk->sk_callback_lock,
1444				af_callback_keys + newsk->sk_family,
1445				af_family_clock_key_strings[newsk->sk_family]);
1446
1447		newsk->sk_dst_cache	= NULL;
1448		newsk->sk_wmem_queued	= 0;
1449		newsk->sk_forward_alloc = 0;
1450		newsk->sk_send_head	= NULL;
1451		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1452
1453		sock_reset_flag(newsk, SOCK_DONE);
1454		skb_queue_head_init(&newsk->sk_error_queue);
1455
1456		filter = rcu_dereference_protected(newsk->sk_filter, 1);
1457		if (filter != NULL)
1458			sk_filter_charge(newsk, filter);
1459
1460		if (unlikely(xfrm_sk_clone_policy(newsk))) {
1461			/* It is still a raw copy of the parent, so invalidate
1462			 * the destructor and do a plain sk_free() */
1463			newsk->sk_destruct = NULL;
1464			bh_unlock_sock(newsk);
1465			sk_free(newsk);
1466			newsk = NULL;
1467			goto out;
1468		}
1469
1470		newsk->sk_err	   = 0;
1471		newsk->sk_priority = 0;
1472		/*
1473		 * Before updating sk_refcnt, we must commit prior changes to memory
1474		 * (Documentation/RCU/rculist_nulls.txt for details)
1475		 */
1476		smp_wmb();
1477		atomic_set(&newsk->sk_refcnt, 2);
1478
1479		/*
1480		 * Increment the counter in the same struct proto as the master
1481		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1482		 * is the same as sk->sk_prot->socks, as this field was copied
1483		 * with memcpy).
1484		 *
1485		 * This _changes_ the previous behaviour, where
1486		 * tcp_create_openreq_child always incremented the
1487		 * equivalent to tcp_prot->socks (inet_sock_nr), so this has
1488		 * to be taken into account in all callers. -acme
1489		 */
1490		sk_refcnt_debug_inc(newsk);
1491		sk_set_socket(newsk, NULL);
1492		newsk->sk_wq = NULL;
1493
1494		sk_update_clone(sk, newsk);
1495
1496		if (newsk->sk_prot->sockets_allocated)
1497			sk_sockets_allocated_inc(newsk);
1498
1499		if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1500			net_enable_timestamp();
1501	}
1502out:
1503	return newsk;
1504}
1505EXPORT_SYMBOL_GPL(sk_clone_lock);
1506
1507void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1508{
1509	__sk_dst_set(sk, dst);
1510	sk->sk_route_caps = dst->dev->features;
1511	if (sk->sk_route_caps & NETIF_F_GSO)
1512		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1513	sk->sk_route_caps &= ~sk->sk_route_nocaps;
1514	if (sk_can_gso(sk)) {
1515		if (dst->header_len) {
1516			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1517		} else {
1518			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1519			sk->sk_gso_max_size = dst->dev->gso_max_size;
1520			sk->sk_gso_max_segs = dst->dev->gso_max_segs;
1521		}
1522	}
1523}
1524EXPORT_SYMBOL_GPL(sk_setup_caps);
1525
1526/*
1527 *	Simple resource managers for sockets.
1528 */
1529
1530
1531/*
1532 * Write buffer destructor automatically called from kfree_skb.
1533 */
1534void sock_wfree(struct sk_buff *skb)
1535{
1536	struct sock *sk = skb->sk;
1537	unsigned int len = skb->truesize;
1538
1539	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1540		/*
1541		 * Keep a reference on sk_wmem_alloc; it will be released
1542		 * after the sk_write_space() call
1543		 */
1544		atomic_sub(len - 1, &sk->sk_wmem_alloc);
1545		sk->sk_write_space(sk);
1546		len = 1;
1547	}
1548	/*
1549	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1550	 * could not do because of in-flight packets
1551	 */
1552	if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1553		__sk_free(sk);
1554}
1555EXPORT_SYMBOL(sock_wfree);
1556
1557/*
1558 * Read buffer destructor automatically called from kfree_skb.
1559 */
1560void sock_rfree(struct sk_buff *skb)
1561{
1562	struct sock *sk = skb->sk;
1563	unsigned int len = skb->truesize;
1564
1565	atomic_sub(len, &sk->sk_rmem_alloc);
1566	sk_mem_uncharge(sk, len);
1567}
1568EXPORT_SYMBOL(sock_rfree);
1569
1570void sock_edemux(struct sk_buff *skb)
1571{
1572	struct sock *sk = skb->sk;
1573
1574#ifdef CONFIG_INET
1575	if (sk->sk_state == TCP_TIME_WAIT)
1576		inet_twsk_put(inet_twsk(sk));
1577	else
1578#endif
1579		sock_put(sk);
1580}
1581EXPORT_SYMBOL(sock_edemux);
1582
1583kuid_t sock_i_uid(struct sock *sk)
1584{
1585	kuid_t uid;
1586
1587	read_lock_bh(&sk->sk_callback_lock);
1588	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1589	read_unlock_bh(&sk->sk_callback_lock);
1590	return uid;
1591}
1592EXPORT_SYMBOL(sock_i_uid);
1593
1594unsigned long sock_i_ino(struct sock *sk)
1595{
1596	unsigned long ino;
1597
1598	read_lock_bh(&sk->sk_callback_lock);
1599	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1600	read_unlock_bh(&sk->sk_callback_lock);
1601	return ino;
1602}
1603EXPORT_SYMBOL(sock_i_ino);
1604
1605/*
1606 * Allocate a skb from the socket's send buffer.
1607 */
1608struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1609			     gfp_t priority)
1610{
1611	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1612		struct sk_buff *skb = alloc_skb(size, priority);
1613		if (skb) {
1614			skb_set_owner_w(skb, sk);
1615			return skb;
1616		}
1617	}
1618	return NULL;
1619}
1620EXPORT_SYMBOL(sock_wmalloc);
1621
1622/*
1623 * Allocate a skb from the socket's receive buffer.
1624 */
1625struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
1626			     gfp_t priority)
1627{
1628	if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1629		struct sk_buff *skb = alloc_skb(size, priority);
1630		if (skb) {
1631			skb_set_owner_r(skb, sk);
1632			return skb;
1633		}
1634	}
1635	return NULL;
1636}
1637
1638/*
1639 * Allocate a memory block from the socket's option memory buffer.
1640 */
1641void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1642{
1643	if ((unsigned int)size <= sysctl_optmem_max &&
1644	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1645		void *mem;
1646		/* First do the add, to avoid the race if kmalloc
1647		 * might sleep.
1648		 */
1649		atomic_add(size, &sk->sk_omem_alloc);
1650		mem = kmalloc(size, priority);
1651		if (mem)
1652			return mem;
1653		atomic_sub(size, &sk->sk_omem_alloc);
1654	}
1655	return NULL;
1656}
1657EXPORT_SYMBOL(sock_kmalloc);
1658
1659/*
1660 * Free an option memory block.
1661 */
1662void sock_kfree_s(struct sock *sk, void *mem, int size)
1663{
1664	kfree(mem);
1665	atomic_sub(size, &sk->sk_omem_alloc);
1666}
1667EXPORT_SYMBOL(sock_kfree_s);
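/*
 * Usage sketch, not taken from this file ("len" is a hypothetical option
 * length, bounded by sysctl_optmem_max):
 *
 *	void *buf = sock_kmalloc(sk, len, GFP_KERNEL);
 *	if (!buf)
 *		return -ENOBUFS;
 *	...
 *	sock_kfree_s(sk, buf, len);
 *
 * The same size must be passed to sock_kfree_s() so the charge taken against
 * sk_omem_alloc by sock_kmalloc() is fully returned.
 */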
1668
1669/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1670   I think these locks should be removed for datagram sockets.
1671 */
1672static long sock_wait_for_wmem(struct sock *sk, long timeo)
1673{
1674	DEFINE_WAIT(wait);
1675
1676	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1677	for (;;) {
1678		if (!timeo)
1679			break;
1680		if (signal_pending(current))
1681			break;
1682		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1683		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1684		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1685			break;
1686		if (sk->sk_shutdown & SEND_SHUTDOWN)
1687			break;
1688		if (sk->sk_err)
1689			break;
1690		timeo = schedule_timeout(timeo);
1691	}
1692	finish_wait(sk_sleep(sk), &wait);
1693	return timeo;
1694}
1695
1696
1697/*
1698 *	Generic send/receive buffer handlers
1699 */
1700
1701struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1702				     unsigned long data_len, int noblock,
1703				     int *errcode)
1704{
1705	struct sk_buff *skb;
1706	gfp_t gfp_mask;
1707	long timeo;
1708	int err;
1709	int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1710
1711	err = -EMSGSIZE;
1712	if (npages > MAX_SKB_FRAGS)
1713		goto failure;
1714
1715	gfp_mask = sk->sk_allocation;
1716	if (gfp_mask & __GFP_WAIT)
1717		gfp_mask |= __GFP_REPEAT;
1718
1719	timeo = sock_sndtimeo(sk, noblock);
1720	while (1) {
1721		err = sock_error(sk);
1722		if (err != 0)
1723			goto failure;
1724
1725		err = -EPIPE;
1726		if (sk->sk_shutdown & SEND_SHUTDOWN)
1727			goto failure;
1728
1729		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1730			skb = alloc_skb(header_len, gfp_mask);
1731			if (skb) {
1732				int i;
1733
1734				/* No pages, we're done... */
1735				if (!data_len)
1736					break;
1737
1738				skb->truesize += data_len;
1739				skb_shinfo(skb)->nr_frags = npages;
1740				for (i = 0; i < npages; i++) {
1741					struct page *page;
1742
1743					page = alloc_pages(sk->sk_allocation, 0);
1744					if (!page) {
1745						err = -ENOBUFS;
1746						skb_shinfo(skb)->nr_frags = i;
1747						kfree_skb(skb);
1748						goto failure;
1749					}
1750
1751					__skb_fill_page_desc(skb, i,
1752							page, 0,
1753							(data_len >= PAGE_SIZE ?
1754							 PAGE_SIZE :
1755							 data_len));
1756					data_len -= PAGE_SIZE;
1757				}
1758
1759				/* Full success... */
1760				break;
1761			}
1762			err = -ENOBUFS;
1763			goto failure;
1764		}
1765		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1766		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1767		err = -EAGAIN;
1768		if (!timeo)
1769			goto failure;
1770		if (signal_pending(current))
1771			goto interrupted;
1772		timeo = sock_wait_for_wmem(sk, timeo);
1773	}
1774
1775	skb_set_owner_w(skb, sk);
1776	return skb;
1777
1778interrupted:
1779	err = sock_intr_errno(timeo);
1780failure:
1781	*errcode = err;
1782	return NULL;
1783}
1784EXPORT_SYMBOL(sock_alloc_send_pskb);
1785
1786struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1787				    int noblock, int *errcode)
1788{
1789	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
1790}
1791EXPORT_SYMBOL(sock_alloc_send_skb);
1792
1793/* On 32bit arches, an skb frag is limited to 2^15 */
1794#define SKB_FRAG_PAGE_ORDER	get_order(32768)
1795
1796bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
1797{
1798	int order;
1799
1800	if (pfrag->page) {
1801		if (atomic_read(&pfrag->page->_count) == 1) {
1802			pfrag->offset = 0;
1803			return true;
1804		}
1805		if (pfrag->offset < pfrag->size)
1806			return true;
1807		put_page(pfrag->page);
1808	}
1809
1810	/* We restrict high order allocations to users that can afford to wait */
1811	order = (sk->sk_allocation & __GFP_WAIT) ? SKB_FRAG_PAGE_ORDER : 0;
1812
1813	do {
1814		gfp_t gfp = sk->sk_allocation;
1815
1816		if (order)
1817			gfp |= __GFP_COMP | __GFP_NOWARN;
1818		pfrag->page = alloc_pages(gfp, order);
1819		if (likely(pfrag->page)) {
1820			pfrag->offset = 0;
1821			pfrag->size = PAGE_SIZE << order;
1822			return true;
1823		}
1824	} while (--order >= 0);
1825
1826	sk_enter_memory_pressure(sk);
1827	sk_stream_moderate_sndbuf(sk);
1828	return false;
1829}
1830EXPORT_SYMBOL(sk_page_frag_refill);
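/*
 * Sketch of the caller pattern this helper expects, modelled loosely on a
 * stream protocol's sendmsg path (names are illustrative):
 *
 *	struct page_frag *pfrag = sk_page_frag(sk);
 *
 *	if (!sk_page_frag_refill(sk, pfrag))
 *		goto wait_for_memory;
 *	copy = min(copy, pfrag->size - pfrag->offset);
 *	... copy user data into pfrag->page at pfrag->offset ...
 *	pfrag->offset += copy;
 *
 * On failure the helper has already entered memory pressure and moderated
 * the send buffer, so the caller normally just waits for memory.
 */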
1831
1832static void __lock_sock(struct sock *sk)
1833	__releases(&sk->sk_lock.slock)
1834	__acquires(&sk->sk_lock.slock)
1835{
1836	DEFINE_WAIT(wait);
1837
1838	for (;;) {
1839		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1840					TASK_UNINTERRUPTIBLE);
1841		spin_unlock_bh(&sk->sk_lock.slock);
1842		schedule();
1843		spin_lock_bh(&sk->sk_lock.slock);
1844		if (!sock_owned_by_user(sk))
1845			break;
1846	}
1847	finish_wait(&sk->sk_lock.wq, &wait);
1848}
1849
1850static void __release_sock(struct sock *sk)
1851	__releases(&sk->sk_lock.slock)
1852	__acquires(&sk->sk_lock.slock)
1853{
1854	struct sk_buff *skb = sk->sk_backlog.head;
1855
1856	do {
1857		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1858		bh_unlock_sock(sk);
1859
1860		do {
1861			struct sk_buff *next = skb->next;
1862
1863			prefetch(next);
1864			WARN_ON_ONCE(skb_dst_is_noref(skb));
1865			skb->next = NULL;
1866			sk_backlog_rcv(sk, skb);
1867
1868			/*
1869			 * We are in process context here with softirqs
1870			 * disabled, use cond_resched_softirq() to preempt.
1871			 * This is safe to do because we've taken the backlog
1872			 * queue private:
1873			 */
1874			cond_resched_softirq();
1875
1876			skb = next;
1877		} while (skb != NULL);
1878
1879		bh_lock_sock(sk);
1880	} while ((skb = sk->sk_backlog.head) != NULL);
1881
1882	/*
1883	 * Doing the zeroing here guarantees we cannot loop forever
1884	 * while a wild producer attempts to flood us.
1885	 */
1886	sk->sk_backlog.len = 0;
1887}
1888
1889/**
1890 * sk_wait_data - wait for data to arrive at sk_receive_queue
1891 * @sk:    sock to wait on
1892 * @timeo: for how long
1893 *
1894 * Now socket state including sk->sk_err is changed only under lock,
1895 * hence we may omit checks after joining the wait queue.
1896 * We check the receive queue before schedule() only as an optimization;
1897 * it is very likely that release_sock() added new data.
1898 */
1899int sk_wait_data(struct sock *sk, long *timeo)
1900{
1901	int rc;
1902	DEFINE_WAIT(wait);
1903
1904	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1905	set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1906	rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1907	clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1908	finish_wait(sk_sleep(sk), &wait);
1909	return rc;
1910}
1911EXPORT_SYMBOL(sk_wait_data);
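/*
 * Simplified sketch of how a protocol's recvmsg path might drive
 * sk_wait_data() (illustrative only, with the socket lock held):
 *
 *	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 *	while (skb_queue_empty(&sk->sk_receive_queue)) {
 *		if (!timeo)
 *			return -EAGAIN;
 *		if (signal_pending(current))
 *			return sock_intr_errno(timeo);
 *		sk_wait_data(sk, &timeo);
 *	}
 *
 * sk_wait_event() releases the socket lock while sleeping (letting backlog
 * processing feed the receive queue) and re-acquires it before returning.
 */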
1912
1913/**
1914 *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
1915 *	@sk: socket
1916 *	@size: memory size to allocate
1917 *	@kind: allocation type
1918 *
1919 *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
1920 *	rmem allocation. This function assumes that protocols which have
1921 *	memory_pressure use sk_wmem_queued as write buffer accounting.
1922 */
1923int __sk_mem_schedule(struct sock *sk, int size, int kind)
1924{
1925	struct proto *prot = sk->sk_prot;
1926	int amt = sk_mem_pages(size);
1927	long allocated;
1928	int parent_status = UNDER_LIMIT;
1929
1930	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
1931
1932	allocated = sk_memory_allocated_add(sk, amt, &parent_status);
1933
1934	/* Under limit. */
1935	if (parent_status == UNDER_LIMIT &&
1936			allocated <= sk_prot_mem_limits(sk, 0)) {
1937		sk_leave_memory_pressure(sk);
1938		return 1;
1939	}
1940
1941	/* Under pressure. (we or our parents) */
1942	if ((parent_status > SOFT_LIMIT) ||
1943			allocated > sk_prot_mem_limits(sk, 1))
1944		sk_enter_memory_pressure(sk);
1945
1946	/* Over hard limit (we or our parents) */
1947	if ((parent_status == OVER_LIMIT) ||
1948			(allocated > sk_prot_mem_limits(sk, 2)))
1949		goto suppress_allocation;
1950
1951	/* guarantee minimum buffer size under pressure */
1952	if (kind == SK_MEM_RECV) {
1953		if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
1954			return 1;
1955
1956	} else { /* SK_MEM_SEND */
1957		if (sk->sk_type == SOCK_STREAM) {
1958			if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
1959				return 1;
1960		} else if (atomic_read(&sk->sk_wmem_alloc) <
1961			   prot->sysctl_wmem[0])
1962				return 1;
1963	}
1964
1965	if (sk_has_memory_pressure(sk)) {
1966		int alloc;
1967
1968		if (!sk_under_memory_pressure(sk))
1969			return 1;
1970		alloc = sk_sockets_allocated_read_positive(sk);
1971		if (sk_prot_mem_limits(sk, 2) > alloc *
1972		    sk_mem_pages(sk->sk_wmem_queued +
1973				 atomic_read(&sk->sk_rmem_alloc) +
1974				 sk->sk_forward_alloc))
1975			return 1;
1976	}
1977
1978suppress_allocation:
1979
1980	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
1981		sk_stream_moderate_sndbuf(sk);
1982
1983		/* Fail only if socket is _under_ its sndbuf.
1984		 * In this case we cannot block, so we have to fail.
1985		 */
1986		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
1987			return 1;
1988	}
1989
1990	trace_sock_exceed_buf_limit(sk, prot, allocated);
1991
1992	/* Alas. Undo changes. */
1993	sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
1994
1995	sk_memory_allocated_sub(sk, amt);
1996
1997	return 0;
1998}
1999EXPORT_SYMBOL(__sk_mem_schedule);
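
/*
 * Broadly speaking, memory-accounted protocols do not call this directly but
 * go through the sk_wmem_schedule()/sk_rmem_schedule() helpers in
 * include/net/sock.h, which fall back to __sk_mem_schedule() only when the
 * request does not fit in the already-charged sk_forward_alloc quantum.
 * A rough send-side sketch:
 *
 *	if (!sk_wmem_schedule(sk, skb->truesize))
 *		goto drop;
 *	skb_set_owner_w(skb, sk);
 */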
2000
2001/**
2002 *	__sk_mem_reclaim - reclaim memory_allocated
2003 *	@sk: socket
2004 */
2005void __sk_mem_reclaim(struct sock *sk)
2006{
2007	sk_memory_allocated_sub(sk,
2008				sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT);
2009	sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
2010
2011	if (sk_under_memory_pressure(sk) &&
2012	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2013		sk_leave_memory_pressure(sk);
2014}
2015EXPORT_SYMBOL(__sk_mem_reclaim);
2016
2017
2018/*
2019 * Set of default routines for initialising struct proto_ops when
2020 * the protocol does not support a particular function. In certain
2021 * cases where it makes no sense for a protocol to have a "do nothing"
2022 * function, some default processing is provided.
2023 */
2024
2025int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2026{
2027	return -EOPNOTSUPP;
2028}
2029EXPORT_SYMBOL(sock_no_bind);
2030
2031int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2032		    int len, int flags)
2033{
2034	return -EOPNOTSUPP;
2035}
2036EXPORT_SYMBOL(sock_no_connect);
2037
2038int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2039{
2040	return -EOPNOTSUPP;
2041}
2042EXPORT_SYMBOL(sock_no_socketpair);
2043
2044int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
2045{
2046	return -EOPNOTSUPP;
2047}
2048EXPORT_SYMBOL(sock_no_accept);
2049
2050int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2051		    int *len, int peer)
2052{
2053	return -EOPNOTSUPP;
2054}
2055EXPORT_SYMBOL(sock_no_getname);
2056
2057unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
2058{
2059	return 0;
2060}
2061EXPORT_SYMBOL(sock_no_poll);
2062
2063int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2064{
2065	return -EOPNOTSUPP;
2066}
2067EXPORT_SYMBOL(sock_no_ioctl);
2068
2069int sock_no_listen(struct socket *sock, int backlog)
2070{
2071	return -EOPNOTSUPP;
2072}
2073EXPORT_SYMBOL(sock_no_listen);
2074
2075int sock_no_shutdown(struct socket *sock, int how)
2076{
2077	return -EOPNOTSUPP;
2078}
2079EXPORT_SYMBOL(sock_no_shutdown);
2080
2081int sock_no_setsockopt(struct socket *sock, int level, int optname,
2082		    char __user *optval, unsigned int optlen)
2083{
2084	return -EOPNOTSUPP;
2085}
2086EXPORT_SYMBOL(sock_no_setsockopt);
2087
2088int sock_no_getsockopt(struct socket *sock, int level, int optname,
2089		    char __user *optval, int __user *optlen)
2090{
2091	return -EOPNOTSUPP;
2092}
2093EXPORT_SYMBOL(sock_no_getsockopt);
2094
2095int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
2096		    size_t len)
2097{
2098	return -EOPNOTSUPP;
2099}
2100EXPORT_SYMBOL(sock_no_sendmsg);
2101
2102int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
2103		    size_t len, int flags)
2104{
2105	return -EOPNOTSUPP;
2106}
2107EXPORT_SYMBOL(sock_no_recvmsg);
2108
2109int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2110{
2111	/* Mirror missing mmap method error code */
2112	return -ENODEV;
2113}
2114EXPORT_SYMBOL(sock_no_mmap);
2115
2116ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2117{
2118	ssize_t res;
2119	struct msghdr msg = {.msg_flags = flags};
2120	struct kvec iov;
2121	char *kaddr = kmap(page);
2122	iov.iov_base = kaddr + offset;
2123	iov.iov_len = size;
2124	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2125	kunmap(page);
2126	return res;
2127}
2128EXPORT_SYMBOL(sock_no_sendpage);
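
/*
 * Illustrative sketch of how a protocol typically plugs these stubs into its
 * struct proto_ops for operations it does not support (all example_* names
 * and PF_EXAMPLE below are hypothetical):
 *
 *	static const struct proto_ops example_dgram_ops = {
 *		.family		= PF_EXAMPLE,
 *		.owner		= THIS_MODULE,
 *		.release	= example_release,
 *		.bind		= example_bind,
 *		.connect	= sock_no_connect,
 *		.socketpair	= sock_no_socketpair,
 *		.accept		= sock_no_accept,
 *		.getname	= example_getname,
 *		.poll		= datagram_poll,
 *		.ioctl		= example_ioctl,
 *		.listen		= sock_no_listen,
 *		.shutdown	= sock_no_shutdown,
 *		.setsockopt	= sock_no_setsockopt,
 *		.getsockopt	= sock_no_getsockopt,
 *		.sendmsg	= example_sendmsg,
 *		.recvmsg	= example_recvmsg,
 *		.mmap		= sock_no_mmap,
 *		.sendpage	= sock_no_sendpage,
 *	};
 */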
2129
2130/*
2131 *	Default Socket Callbacks
2132 */
2133
2134static void sock_def_wakeup(struct sock *sk)
2135{
2136	struct socket_wq *wq;
2137
2138	rcu_read_lock();
2139	wq = rcu_dereference(sk->sk_wq);
2140	if (wq_has_sleeper(wq))
2141		wake_up_interruptible_all(&wq->wait);
2142	rcu_read_unlock();
2143}
2144
2145static void sock_def_error_report(struct sock *sk)
2146{
2147	struct socket_wq *wq;
2148
2149	rcu_read_lock();
2150	wq = rcu_dereference(sk->sk_wq);
2151	if (wq_has_sleeper(wq))
2152		wake_up_interruptible_poll(&wq->wait, POLLERR);
2153	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2154	rcu_read_unlock();
2155}
2156
2157static void sock_def_readable(struct sock *sk, int len)
2158{
2159	struct socket_wq *wq;
2160
2161	rcu_read_lock();
2162	wq = rcu_dereference(sk->sk_wq);
2163	if (wq_has_sleeper(wq))
2164		wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
2165						POLLRDNORM | POLLRDBAND);
2166	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2167	rcu_read_unlock();
2168}
2169
2170static void sock_def_write_space(struct sock *sk)
2171{
2172	struct socket_wq *wq;
2173
2174	rcu_read_lock();
2175
2176	/* Do not wake up a writer until he can make "significant"
2177	 * progress.  --DaveM
2178	 */
2179	if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2180		wq = rcu_dereference(sk->sk_wq);
2181		if (wq_has_sleeper(wq))
2182			wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
2183						POLLWRNORM | POLLWRBAND);
2184
2185		/* Should agree with poll, otherwise some programs break */
2186		if (sock_writeable(sk))
2187			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2188	}
2189
2190	rcu_read_unlock();
2191}
2192
2193static void sock_def_destruct(struct sock *sk)
2194{
2195	kfree(sk->sk_protinfo);
2196}
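
/*
 * These defaults are installed by sock_init_data() below and may be replaced
 * by in-kernel users of a socket. A minimal sketch of overriding the
 * data-ready callback (my_state/my_data_ready are hypothetical names),
 * following the usual sk_callback_lock convention:
 *
 *	write_lock_bh(&sk->sk_callback_lock);
 *	sk->sk_user_data  = my_state;
 *	sk->sk_data_ready = my_data_ready;
 *	write_unlock_bh(&sk->sk_callback_lock);
 *
 * where my_data_ready() matches the void (*)(struct sock *, int) signature
 * of sock_def_readable() above.
 */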
2197
2198void sk_send_sigurg(struct sock *sk)
2199{
2200	if (sk->sk_socket && sk->sk_socket->file)
2201		if (send_sigurg(&sk->sk_socket->file->f_owner))
2202			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2203}
2204EXPORT_SYMBOL(sk_send_sigurg);
2205
2206void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2207		    unsigned long expires)
2208{
2209	if (!mod_timer(timer, expires))
2210		sock_hold(sk);
2211}
2212EXPORT_SYMBOL(sk_reset_timer);
2213
2214void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2215{
2216	if (del_timer(timer))
2217		__sock_put(sk);
2218}
2219EXPORT_SYMBOL(sk_stop_timer);
2220
2221void sock_init_data(struct socket *sock, struct sock *sk)
2222{
2223	skb_queue_head_init(&sk->sk_receive_queue);
2224	skb_queue_head_init(&sk->sk_write_queue);
2225	skb_queue_head_init(&sk->sk_error_queue);
2226#ifdef CONFIG_NET_DMA
2227	skb_queue_head_init(&sk->sk_async_wait_queue);
2228#endif
2229
2230	sk->sk_send_head	=	NULL;
2231
2232	init_timer(&sk->sk_timer);
2233
2234	sk->sk_allocation	=	GFP_KERNEL;
2235	sk->sk_rcvbuf		=	sysctl_rmem_default;
2236	sk->sk_sndbuf		=	sysctl_wmem_default;
2237	sk->sk_state		=	TCP_CLOSE;
2238	sk_set_socket(sk, sock);
2239
2240	sock_set_flag(sk, SOCK_ZAPPED);
2241
2242	if (sock) {
2243		sk->sk_type	=	sock->type;
2244		sk->sk_wq	=	sock->wq;
2245		sock->sk	=	sk;
2246	} else
2247		sk->sk_wq	=	NULL;
2248
2249	spin_lock_init(&sk->sk_dst_lock);
2250	rwlock_init(&sk->sk_callback_lock);
2251	lockdep_set_class_and_name(&sk->sk_callback_lock,
2252			af_callback_keys + sk->sk_family,
2253			af_family_clock_key_strings[sk->sk_family]);
2254
2255	sk->sk_state_change	=	sock_def_wakeup;
2256	sk->sk_data_ready	=	sock_def_readable;
2257	sk->sk_write_space	=	sock_def_write_space;
2258	sk->sk_error_report	=	sock_def_error_report;
2259	sk->sk_destruct		=	sock_def_destruct;
2260
2261	sk->sk_frag.page	=	NULL;
2262	sk->sk_frag.offset	=	0;
2263	sk->sk_peek_off		=	-1;
2264
2265	sk->sk_peer_pid 	=	NULL;
2266	sk->sk_peer_cred	=	NULL;
2267	sk->sk_write_pending	=	0;
2268	sk->sk_rcvlowat		=	1;
2269	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
2270	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
2271
2272	sk->sk_stamp = ktime_set(-1L, 0);
2273
2274	/*
2275	 * Before updating sk_refcnt, we must commit prior changes to memory
2276	 * (Documentation/RCU/rculist_nulls.txt for details)
2277	 */
2278	smp_wmb();
2279	atomic_set(&sk->sk_refcnt, 1);
2280	atomic_set(&sk->sk_drops, 0);
2281}
2282EXPORT_SYMBOL(sock_init_data);
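
/*
 * Rough sketch of how a protocol's create() hook typically pairs sk_alloc()
 * with sock_init_data() (example_proto and PF_EXAMPLE are hypothetical):
 *
 *	struct sock *sk;
 *
 *	sk = sk_alloc(net, PF_EXAMPLE, GFP_KERNEL, &example_proto);
 *	if (!sk)
 *		return -ENOMEM;
 *
 *	sock_init_data(sock, sk);
 *	sk->sk_protocol = protocol;
 */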
2283
2284void lock_sock_nested(struct sock *sk, int subclass)
2285{
2286	might_sleep();
2287	spin_lock_bh(&sk->sk_lock.slock);
2288	if (sk->sk_lock.owned)
2289		__lock_sock(sk);
2290	sk->sk_lock.owned = 1;
2291	spin_unlock(&sk->sk_lock.slock);
2292	/*
2293	 * The sk_lock has mutex_lock() semantics here:
2294	 */
2295	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2296	local_bh_enable();
2297}
2298EXPORT_SYMBOL(lock_sock_nested);
2299
2300void release_sock(struct sock *sk)
2301{
2302	/*
2303	 * The sk_lock has mutex_unlock() semantics:
2304	 */
2305	mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
2306
2307	spin_lock_bh(&sk->sk_lock.slock);
2308	if (sk->sk_backlog.tail)
2309		__release_sock(sk);
2310
2311	if (sk->sk_prot->release_cb)
2312		sk->sk_prot->release_cb(sk);
2313
2314	sk->sk_lock.owned = 0;
2315	if (waitqueue_active(&sk->sk_lock.wq))
2316		wake_up(&sk->sk_lock.wq);
2317	spin_unlock_bh(&sk->sk_lock.slock);
2318}
2319EXPORT_SYMBOL(release_sock);
2320
2321/**
2322 * lock_sock_fast - fast version of lock_sock
2323 * @sk: socket
2324 *
2325 * This version should be used for very small sections, where the process won't block.
2326 * Returns false if the fast path is taken:
2327 *   sk_lock.slock locked, owned = 0, BH disabled
2328 * Returns true if the slow path is taken:
2329 *   sk_lock.slock unlocked, owned = 1, BH enabled
2330 */
2331bool lock_sock_fast(struct sock *sk)
2332{
2333	might_sleep();
2334	spin_lock_bh(&sk->sk_lock.slock);
2335
2336	if (!sk->sk_lock.owned)
2337		/*
2338		 * Note : We must disable BH
2339		 */
2340		return false;
2341
2342	__lock_sock(sk);
2343	sk->sk_lock.owned = 1;
2344	spin_unlock(&sk->sk_lock.slock);
2345	/*
2346	 * The sk_lock has mutex_lock() semantics here:
2347	 */
2348	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2349	local_bh_enable();
2350	return true;
2351}
2352EXPORT_SYMBOL(lock_sock_fast);
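
/*
 * The return value must be handed back to unlock_sock_fast(), so the usual
 * pattern for a short critical section is:
 *
 *	bool slow = lock_sock_fast(sk);
 *
 *	... short, non-blocking work on the socket ...
 *
 *	unlock_sock_fast(sk, slow);
 */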
2353
2354int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2355{
2356	struct timeval tv;
2357	if (!sock_flag(sk, SOCK_TIMESTAMP))
2358		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2359	tv = ktime_to_timeval(sk->sk_stamp);
2360	if (tv.tv_sec == -1)
2361		return -ENOENT;
2362	if (tv.tv_sec == 0) {
2363		sk->sk_stamp = ktime_get_real();
2364		tv = ktime_to_timeval(sk->sk_stamp);
2365	}
2366	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2367}
2368EXPORT_SYMBOL(sock_get_timestamp);
2369
2370int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2371{
2372	struct timespec ts;
2373	if (!sock_flag(sk, SOCK_TIMESTAMP))
2374		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2375	ts = ktime_to_timespec(sk->sk_stamp);
2376	if (ts.tv_sec == -1)
2377		return -ENOENT;
2378	if (ts.tv_sec == 0) {
2379		sk->sk_stamp = ktime_get_real();
2380		ts = ktime_to_timespec(sk->sk_stamp);
2381	}
2382	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2383}
2384EXPORT_SYMBOL(sock_get_timestampns);
2385
2386void sock_enable_timestamp(struct sock *sk, int flag)
2387{
2388	if (!sock_flag(sk, flag)) {
2389		unsigned long previous_flags = sk->sk_flags;
2390
2391		sock_set_flag(sk, flag);
2392		/*
2393		 * We just set one of the two flags which require net
2394		 * time stamping, but time stamping might already have been
2395		 * enabled because of the other one.
2396		 */
2397		if (!(previous_flags & SK_FLAGS_TIMESTAMP))
2398			net_enable_timestamp();
2399	}
2400}
2401
2402/*
2403 *	Get a socket option on a socket.
2404 *
2405 *	FIX: POSIX 1003.1g is very ambiguous here. It states that
2406 *	asynchronous errors should be reported by getsockopt. We assume
2407 *	this means if you specify SO_ERROR (otherwise what's the point of it).
2408 */
2409int sock_common_getsockopt(struct socket *sock, int level, int optname,
2410			   char __user *optval, int __user *optlen)
2411{
2412	struct sock *sk = sock->sk;
2413
2414	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2415}
2416EXPORT_SYMBOL(sock_common_getsockopt);
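
/*
 * From user space, the asynchronous-error convention discussed above amounts
 * to querying SO_ERROR, e.g. after a non-blocking connect(); err receives the
 * pending (soft) error and the socket's error state is cleared:
 *
 *	int err = 0;
 *	socklen_t len = sizeof(err);
 *
 *	getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len);
 */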
2417
2418#ifdef CONFIG_COMPAT
2419int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2420				  char __user *optval, int __user *optlen)
2421{
2422	struct sock *sk = sock->sk;
2423
2424	if (sk->sk_prot->compat_getsockopt != NULL)
2425		return sk->sk_prot->compat_getsockopt(sk, level, optname,
2426						      optval, optlen);
2427	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2428}
2429EXPORT_SYMBOL(compat_sock_common_getsockopt);
2430#endif
2431
2432int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
2433			struct msghdr *msg, size_t size, int flags)
2434{
2435	struct sock *sk = sock->sk;
2436	int addr_len = 0;
2437	int err;
2438
2439	err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
2440				   flags & ~MSG_DONTWAIT, &addr_len);
2441	if (err >= 0)
2442		msg->msg_namelen = addr_len;
2443	return err;
2444}
2445EXPORT_SYMBOL(sock_common_recvmsg);
2446
2447/*
2448 *	Set socket options on a socket.
2449 */
2450int sock_common_setsockopt(struct socket *sock, int level, int optname,
2451			   char __user *optval, unsigned int optlen)
2452{
2453	struct sock *sk = sock->sk;
2454
2455	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2456}
2457EXPORT_SYMBOL(sock_common_setsockopt);
2458
2459#ifdef CONFIG_COMPAT
2460int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2461				  char __user *optval, unsigned int optlen)
2462{
2463	struct sock *sk = sock->sk;
2464
2465	if (sk->sk_prot->compat_setsockopt != NULL)
2466		return sk->sk_prot->compat_setsockopt(sk, level, optname,
2467						      optval, optlen);
2468	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2469}
2470EXPORT_SYMBOL(compat_sock_common_setsockopt);
2471#endif
2472
2473void sk_common_release(struct sock *sk)
2474{
2475	if (sk->sk_prot->destroy)
2476		sk->sk_prot->destroy(sk);
2477
2478	/*
2479	 * Observation: when sk_common_release is called, processes have
2480	 * no access to the socket, but the network stack still does.
2481	 * Step one, detach it from networking:
2482	 *
2483	 * A. Remove from hash tables.
2484	 */
2485
2486	sk->sk_prot->unhash(sk);
2487
2488	/*
2489	 * At this point the socket cannot receive new packets, but it is possible
2490	 * that some packets are still in flight because another CPU is running the
2491	 * receiver and did its hash table lookup before we unhashed the socket.
2492	 * Those packets will reach the receive queue and be purged by the socket destructor.
2493	 *
2494	 * We also still have packets pending on the receive queue and, probably, our
2495	 * own packets waiting in device queues. sock_destroy will drain the receive
2496	 * queue, but transmitted packets will delay socket destruction until the
2497	 * last reference is released.
2498	 */
2499
2500	sock_orphan(sk);
2501
2502	xfrm_sk_free_policy(sk);
2503
2504	sk_refcnt_debug_release(sk);
2505
2506	if (sk->sk_frag.page) {
2507		put_page(sk->sk_frag.page);
2508		sk->sk_frag.page = NULL;
2509	}
2510
2511	sock_put(sk);
2512}
2513EXPORT_SYMBOL(sk_common_release);
2514
2515#ifdef CONFIG_PROC_FS
2516#define PROTO_INUSE_NR	64	/* should be enough for the first time */
2517struct prot_inuse {
2518	int val[PROTO_INUSE_NR];
2519};
2520
2521static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2522
2523#ifdef CONFIG_NET_NS
2524void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2525{
2526	__this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
2527}
2528EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2529
2530int sock_prot_inuse_get(struct net *net, struct proto *prot)
2531{
2532	int cpu, idx = prot->inuse_idx;
2533	int res = 0;
2534
2535	for_each_possible_cpu(cpu)
2536		res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2537
2538	return res >= 0 ? res : 0;
2539}
2540EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2541
2542static int __net_init sock_inuse_init_net(struct net *net)
2543{
2544	net->core.inuse = alloc_percpu(struct prot_inuse);
2545	return net->core.inuse ? 0 : -ENOMEM;
2546}
2547
2548static void __net_exit sock_inuse_exit_net(struct net *net)
2549{
2550	free_percpu(net->core.inuse);
2551}
2552
2553static struct pernet_operations net_inuse_ops = {
2554	.init = sock_inuse_init_net,
2555	.exit = sock_inuse_exit_net,
2556};
2557
2558static __init int net_inuse_init(void)
2559{
2560	if (register_pernet_subsys(&net_inuse_ops))
2561		panic("Cannot initialize net inuse counters");
2562
2563	return 0;
2564}
2565
2566core_initcall(net_inuse_init);
2567#else
2568static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2569
2570void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2571{
2572	__this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
2573}
2574EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2575
2576int sock_prot_inuse_get(struct net *net, struct proto *prot)
2577{
2578	int cpu, idx = prot->inuse_idx;
2579	int res = 0;
2580
2581	for_each_possible_cpu(cpu)
2582		res += per_cpu(prot_inuse, cpu).val[idx];
2583
2584	return res >= 0 ? res : 0;
2585}
2586EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2587#endif
2588
2589static void assign_proto_idx(struct proto *prot)
2590{
2591	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2592
2593	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2594		pr_err("PROTO_INUSE_NR exhausted\n");
2595		return;
2596	}
2597
2598	set_bit(prot->inuse_idx, proto_inuse_idx);
2599}
2600
2601static void release_proto_idx(struct proto *prot)
2602{
2603	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2604		clear_bit(prot->inuse_idx, proto_inuse_idx);
2605}
2606#else
2607static inline void assign_proto_idx(struct proto *prot)
2608{
2609}
2610
2611static inline void release_proto_idx(struct proto *prot)
2612{
2613}
2614#endif
2615
2616int proto_register(struct proto *prot, int alloc_slab)
2617{
2618	if (alloc_slab) {
2619		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2620					SLAB_HWCACHE_ALIGN | prot->slab_flags,
2621					NULL);
2622
2623		if (prot->slab == NULL) {
2624			pr_crit("%s: Can't create sock SLAB cache!\n",
2625				prot->name);
2626			goto out;
2627		}
2628
2629		if (prot->rsk_prot != NULL) {
2630			prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name);
2631			if (prot->rsk_prot->slab_name == NULL)
2632				goto out_free_sock_slab;
2633
2634			prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
2635								 prot->rsk_prot->obj_size, 0,
2636								 SLAB_HWCACHE_ALIGN, NULL);
2637
2638			if (prot->rsk_prot->slab == NULL) {
2639				pr_crit("%s: Can't create request sock SLAB cache!\n",
2640					prot->name);
2641				goto out_free_request_sock_slab_name;
2642			}
2643		}
2644
2645		if (prot->twsk_prot != NULL) {
2646			prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
2647
2648			if (prot->twsk_prot->twsk_slab_name == NULL)
2649				goto out_free_request_sock_slab;
2650
2651			prot->twsk_prot->twsk_slab =
2652				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2653						  prot->twsk_prot->twsk_obj_size,
2654						  0,
2655						  SLAB_HWCACHE_ALIGN |
2656							prot->slab_flags,
2657						  NULL);
2658			if (prot->twsk_prot->twsk_slab == NULL)
2659				goto out_free_timewait_sock_slab_name;
2660		}
2661	}
2662
2663	mutex_lock(&proto_list_mutex);
2664	list_add(&prot->node, &proto_list);
2665	assign_proto_idx(prot);
2666	mutex_unlock(&proto_list_mutex);
2667	return 0;
2668
2669out_free_timewait_sock_slab_name:
2670	kfree(prot->twsk_prot->twsk_slab_name);
2671out_free_request_sock_slab:
2672	if (prot->rsk_prot && prot->rsk_prot->slab) {
2673		kmem_cache_destroy(prot->rsk_prot->slab);
2674		prot->rsk_prot->slab = NULL;
2675	}
2676out_free_request_sock_slab_name:
2677	if (prot->rsk_prot)
2678		kfree(prot->rsk_prot->slab_name);
2679out_free_sock_slab:
2680	kmem_cache_destroy(prot->slab);
2681	prot->slab = NULL;
2682out:
2683	return -ENOBUFS;
2684}
2685EXPORT_SYMBOL(proto_register);
2686
2687void proto_unregister(struct proto *prot)
2688{
2689	mutex_lock(&proto_list_mutex);
2690	release_proto_idx(prot);
2691	list_del(&prot->node);
2692	mutex_unlock(&proto_list_mutex);
2693
2694	if (prot->slab != NULL) {
2695		kmem_cache_destroy(prot->slab);
2696		prot->slab = NULL;
2697	}
2698
2699	if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
2700		kmem_cache_destroy(prot->rsk_prot->slab);
2701		kfree(prot->rsk_prot->slab_name);
2702		prot->rsk_prot->slab = NULL;
2703	}
2704
2705	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2706		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2707		kfree(prot->twsk_prot->twsk_slab_name);
2708		prot->twsk_prot->twsk_slab = NULL;
2709	}
2710}
2711EXPORT_SYMBOL(proto_unregister);
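
/*
 * Minimal sketch of the registration pattern from a protocol module
 * (example_proto/example_sock and the example_* functions are hypothetical):
 *
 *	static struct proto example_proto = {
 *		.name	  = "EXAMPLE",
 *		.owner	  = THIS_MODULE,
 *		.obj_size = sizeof(struct example_sock),
 *	};
 *
 *	static int __init example_init(void)
 *	{
 *		return proto_register(&example_proto, 1);
 *	}
 *
 *	static void __exit example_exit(void)
 *	{
 *		proto_unregister(&example_proto);
 *	}
 */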
2712
2713#ifdef CONFIG_PROC_FS
2714static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2715	__acquires(proto_list_mutex)
2716{
2717	mutex_lock(&proto_list_mutex);
2718	return seq_list_start_head(&proto_list, *pos);
2719}
2720
2721static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2722{
2723	return seq_list_next(v, &proto_list, pos);
2724}
2725
2726static void proto_seq_stop(struct seq_file *seq, void *v)
2727	__releases(proto_list_mutex)
2728{
2729	mutex_unlock(&proto_list_mutex);
2730}
2731
2732static char proto_method_implemented(const void *method)
2733{
2734	return method == NULL ? 'n' : 'y';
2735}
2736static long sock_prot_memory_allocated(struct proto *proto)
2737{
2738	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
2739}
2740
2741static char *sock_prot_memory_pressure(struct proto *proto)
2742{
2743	return proto->memory_pressure != NULL ?
2744	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
2745}
2746
2747static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2748{
2749
2750	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
2751			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2752		   proto->name,
2753		   proto->obj_size,
2754		   sock_prot_inuse_get(seq_file_net(seq), proto),
2755		   sock_prot_memory_allocated(proto),
2756		   sock_prot_memory_pressure(proto),
2757		   proto->max_header,
2758		   proto->slab == NULL ? "no" : "yes",
2759		   module_name(proto->owner),
2760		   proto_method_implemented(proto->close),
2761		   proto_method_implemented(proto->connect),
2762		   proto_method_implemented(proto->disconnect),
2763		   proto_method_implemented(proto->accept),
2764		   proto_method_implemented(proto->ioctl),
2765		   proto_method_implemented(proto->init),
2766		   proto_method_implemented(proto->destroy),
2767		   proto_method_implemented(proto->shutdown),
2768		   proto_method_implemented(proto->setsockopt),
2769		   proto_method_implemented(proto->getsockopt),
2770		   proto_method_implemented(proto->sendmsg),
2771		   proto_method_implemented(proto->recvmsg),
2772		   proto_method_implemented(proto->sendpage),
2773		   proto_method_implemented(proto->bind),
2774		   proto_method_implemented(proto->backlog_rcv),
2775		   proto_method_implemented(proto->hash),
2776		   proto_method_implemented(proto->unhash),
2777		   proto_method_implemented(proto->get_port),
2778		   proto_method_implemented(proto->enter_memory_pressure));
2779}
2780
2781static int proto_seq_show(struct seq_file *seq, void *v)
2782{
2783	if (v == &proto_list)
2784		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2785			   "protocol",
2786			   "size",
2787			   "sockets",
2788			   "memory",
2789			   "press",
2790			   "maxhdr",
2791			   "slab",
2792			   "module",
2793			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2794	else
2795		proto_seq_printf(seq, list_entry(v, struct proto, node));
2796	return 0;
2797}
2798
2799static const struct seq_operations proto_seq_ops = {
2800	.start  = proto_seq_start,
2801	.next   = proto_seq_next,
2802	.stop   = proto_seq_stop,
2803	.show   = proto_seq_show,
2804};
2805
2806static int proto_seq_open(struct inode *inode, struct file *file)
2807{
2808	return seq_open_net(inode, file, &proto_seq_ops,
2809			    sizeof(struct seq_net_private));
2810}
2811
2812static const struct file_operations proto_seq_fops = {
2813	.owner		= THIS_MODULE,
2814	.open		= proto_seq_open,
2815	.read		= seq_read,
2816	.llseek		= seq_lseek,
2817	.release	= seq_release_net,
2818};
2819
2820static __net_init int proto_init_net(struct net *net)
2821{
2822	if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
2823		return -ENOMEM;
2824
2825	return 0;
2826}
2827
2828static __net_exit void proto_exit_net(struct net *net)
2829{
2830	remove_proc_entry("protocols", net->proc_net);
2831}
2832
2833
2834static __net_initdata struct pernet_operations proto_net_ops = {
2835	.init = proto_init_net,
2836	.exit = proto_exit_net,
2837};
2838
2839static int __init proto_init(void)
2840{
2841	return register_pernet_subsys(&proto_net_ops);
2842}
2843
2844subsys_initcall(proto_init);
2845
2846#endif /* PROC_FS */
2847