sock.c revision 2d48d67fa8cd129ea85ea02d91b4a793286866f8
1/*
2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
3 *		operating system.  INET is implemented using the  BSD Socket
4 *		interface as the means of communication with the user level.
5 *
6 *		Generic socket support routines: memory allocators, socket lock/release
7 *		handlers for protocols to use, and a generic option handler.
8 *
9 *
10 * Authors:	Ross Biro
11 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 *		Florian La Roche, <flla@stud.uni-sb.de>
13 *		Alan Cox, <A.Cox@swansea.ac.uk>
14 *
15 * Fixes:
16 *		Alan Cox	: 	Numerous verify_area() problems
17 *		Alan Cox	:	Connecting on a connecting socket
18 *					now returns an error for tcp.
19 *		Alan Cox	:	sock->protocol is set correctly.
20 *					and is not sometimes left as 0.
21 *		Alan Cox	:	connect handles icmp errors on a
22 *					connect properly. Unfortunately there
23 *					is a restart syscall nasty there. I
24 *					can't match BSD without hacking the C
25 *					library. Ideas urgently sought!
26 *		Alan Cox	:	Disallow bind() to addresses that are
27 *					not ours - especially broadcast ones!!
28 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30 *					instead they leave that for the DESTROY timer.
31 *		Alan Cox	:	Clean up error flag in accept
32 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33 *					was buggy. Put a remove_sock() in the handler
34 *					for memory when we hit 0. Also altered the timer
35 *					code. The ACK stuff can wait and needs major
36 *					TCP layer surgery.
37 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38 *					and fixed timer/inet_bh race.
39 *		Alan Cox	:	Added zapped flag for TCP
40 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47 *	Pauline Middelink	:	identd support
48 *		Alan Cox	:	Fixed connect() taking signals I think.
49 *		Alan Cox	:	SO_LINGER supported
50 *		Alan Cox	:	Error reporting fixes
51 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52 *		Alan Cox	:	inet sockets don't set sk->type!
53 *		Alan Cox	:	Split socket option code
54 *		Alan Cox	:	Callbacks
55 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56 *		Alex		:	Removed restriction on inet fioctl
57 *		Alan Cox	:	Splitting INET from NET core
58 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60 *		Alan Cox	:	Split IP from generic code
61 *		Alan Cox	:	New kfree_skbmem()
62 *		Alan Cox	:	Make SO_DEBUG superuser only.
63 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64 *					(compatibility fix)
65 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66 *		Alan Cox	:	Allocator for a socket is settable.
67 *		Alan Cox	:	SO_ERROR includes soft errors.
68 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69 *		Alan Cox	: 	Generic socket allocation to make hooks
70 *					easier (suggested by Craig Metz).
71 *		Michael Pall	:	SO_ERROR returns positive errno again
72 *              Steve Whitehouse:       Added default destructor to free
73 *                                      protocol private data.
74 *              Steve Whitehouse:       Added various other default routines
75 *                                      common to several socket families.
76 *              Chris Evans     :       Call suser() check last on F_SETOWN
77 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79 *		Andi Kleen	:	Fix write_space callback
80 *		Chris Evans	:	Security fixes - signedness again
81 *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82 *
83 * To Fix:
84 *
85 *
86 *		This program is free software; you can redistribute it and/or
87 *		modify it under the terms of the GNU General Public License
88 *		as published by the Free Software Foundation; either version
89 *		2 of the License, or (at your option) any later version.
90 */
91
92#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
93
94#include <linux/capability.h>
95#include <linux/errno.h>
96#include <linux/types.h>
97#include <linux/socket.h>
98#include <linux/in.h>
99#include <linux/kernel.h>
100#include <linux/module.h>
101#include <linux/proc_fs.h>
102#include <linux/seq_file.h>
103#include <linux/sched.h>
104#include <linux/timer.h>
105#include <linux/string.h>
106#include <linux/sockios.h>
107#include <linux/net.h>
108#include <linux/mm.h>
109#include <linux/slab.h>
110#include <linux/interrupt.h>
111#include <linux/poll.h>
112#include <linux/tcp.h>
113#include <linux/init.h>
114#include <linux/highmem.h>
115#include <linux/user_namespace.h>
116#include <linux/static_key.h>
117#include <linux/memcontrol.h>
118#include <linux/prefetch.h>
119
120#include <asm/uaccess.h>
121
122#include <linux/netdevice.h>
123#include <net/protocol.h>
124#include <linux/skbuff.h>
125#include <net/net_namespace.h>
126#include <net/request_sock.h>
127#include <net/sock.h>
128#include <linux/net_tstamp.h>
129#include <net/xfrm.h>
130#include <linux/ipsec.h>
131#include <net/cls_cgroup.h>
132#include <net/netprio_cgroup.h>
133
134#include <linux/filter.h>
135
136#include <trace/events/sock.h>
137
138#ifdef CONFIG_INET
139#include <net/tcp.h>
140#endif
141
142#include <net/ll_poll.h>
143
144static DEFINE_MUTEX(proto_list_mutex);
145static LIST_HEAD(proto_list);
146
147#ifdef CONFIG_MEMCG_KMEM
148int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
149{
150	struct proto *proto;
151	int ret = 0;
152
153	mutex_lock(&proto_list_mutex);
154	list_for_each_entry(proto, &proto_list, node) {
155		if (proto->init_cgroup) {
156			ret = proto->init_cgroup(memcg, ss);
157			if (ret)
158				goto out;
159		}
160	}
161
162	mutex_unlock(&proto_list_mutex);
163	return ret;
164out:
165	list_for_each_entry_continue_reverse(proto, &proto_list, node)
166		if (proto->destroy_cgroup)
167			proto->destroy_cgroup(memcg);
168	mutex_unlock(&proto_list_mutex);
169	return ret;
170}
171
172void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg)
173{
174	struct proto *proto;
175
176	mutex_lock(&proto_list_mutex);
177	list_for_each_entry_reverse(proto, &proto_list, node)
178		if (proto->destroy_cgroup)
179			proto->destroy_cgroup(memcg);
180	mutex_unlock(&proto_list_mutex);
181}
182#endif
183
184/*
185 * Each address family might have different locking rules, so we have
186 * one slock key per address family:
187 */
188static struct lock_class_key af_family_keys[AF_MAX];
189static struct lock_class_key af_family_slock_keys[AF_MAX];
190
191#if defined(CONFIG_MEMCG_KMEM)
192struct static_key memcg_socket_limit_enabled;
193EXPORT_SYMBOL(memcg_socket_limit_enabled);
194#endif
195
196/*
197 * Make lock validator output more readable. (we pre-construct these
198 * strings at build time, so that runtime initialization of socket
199 * locks is fast):
200 */
201static const char *const af_family_key_strings[AF_MAX+1] = {
202  "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
203  "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
204  "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
205  "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
206  "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
207  "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
208  "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
209  "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
210  "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
211  "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
212  "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV"        ,
213  "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
214  "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
215  "sk_lock-AF_NFC"   , "sk_lock-AF_VSOCK"    , "sk_lock-AF_MAX"
216};
217static const char *const af_family_slock_key_strings[AF_MAX+1] = {
218  "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
219  "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
220  "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
221  "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
222  "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
223  "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
224  "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
225  "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
226  "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
227  "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
228  "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
229  "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
230  "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
231  "slock-AF_NFC"   , "slock-AF_VSOCK"    , "slock-AF_MAX"
232};
233static const char *const af_family_clock_key_strings[AF_MAX+1] = {
234  "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
235  "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
236  "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
237  "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
238  "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
239  "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
240  "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
241  "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
242  "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
243  "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
244  "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
245  "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
246  "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
247  "clock-AF_NFC"   , "clock-AF_VSOCK"    , "clock-AF_MAX"
248};
249
250/*
251 * sk_callback_lock locking rules are per-address-family,
252 * so split the lock classes by using a per-AF key:
253 */
254static struct lock_class_key af_callback_keys[AF_MAX];
255
256/* Take into consideration the size of the struct sk_buff overhead in the
257 * determination of these values, since that is non-constant across
258 * platforms.  This makes socket queueing behavior and performance
259 * not depend upon such differences.
260 */
261#define _SK_MEM_PACKETS		256
262#define _SK_MEM_OVERHEAD	SKB_TRUESIZE(256)
263#define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
264#define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
265
266/* Run time adjustable parameters. */
267__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
268EXPORT_SYMBOL(sysctl_wmem_max);
269__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
270EXPORT_SYMBOL(sysctl_rmem_max);
271__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
272__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
273
274/* Maximal space eaten by iovec or ancillary data plus some space */
275int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
276EXPORT_SYMBOL(sysctl_optmem_max);
277
278struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
279EXPORT_SYMBOL_GPL(memalloc_socks);
280
281/**
282 * sk_set_memalloc - sets %SOCK_MEMALLOC
283 * @sk: socket to set it on
284 *
285 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
286 * It's the responsibility of the admin to adjust min_free_kbytes
287 * to meet the requirements.
288 */
289void sk_set_memalloc(struct sock *sk)
290{
291	sock_set_flag(sk, SOCK_MEMALLOC);
292	sk->sk_allocation |= __GFP_MEMALLOC;
293	static_key_slow_inc(&memalloc_socks);
294}
295EXPORT_SYMBOL_GPL(sk_set_memalloc);
296
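/**
 * sk_clear_memalloc - clear %SOCK_MEMALLOC
 * @sk: socket to clear it on
 *
 * Clear %SOCK_MEMALLOC and drop the static key reference taken by
 * sk_set_memalloc(). Should only be called on sockets that are being
 * torn down; see the comment in the body about leftover accounting.
 */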
297void sk_clear_memalloc(struct sock *sk)
298{
299	sock_reset_flag(sk, SOCK_MEMALLOC);
300	sk->sk_allocation &= ~__GFP_MEMALLOC;
301	static_key_slow_dec(&memalloc_socks);
302
303	/*
304	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
305	 * progress of swapping. However, if SOCK_MEMALLOC is cleared while
306	 * it has rmem allocations there is a risk that the user of the
307	 * socket cannot make forward progress due to exceeding the rmem
308	 * limits. By rights, sk_clear_memalloc() should only be called
309	 * on sockets being torn down but warn and reset the accounting if
310	 * that assumption breaks.
311	 */
312	if (WARN_ON(sk->sk_forward_alloc))
313		sk_mem_reclaim(sk);
314}
315EXPORT_SYMBOL_GPL(sk_clear_memalloc);
316
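/*
 * Receive a backlogged skb for a %SOCK_MEMALLOC socket with PF_MEMALLOC
 * set, so that allocations made by the protocol's backlog receive
 * handler may dip into the memory reserves.
 */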
317int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
318{
319	int ret;
320	unsigned long pflags = current->flags;
321
322	/* these should have been dropped before queueing */
323	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
324
325	current->flags |= PF_MEMALLOC;
326	ret = sk->sk_backlog_rcv(sk, skb);
327	tsk_restore_flags(current, pflags, PF_MEMALLOC);
328
329	return ret;
330}
331EXPORT_SYMBOL(__sk_backlog_rcv);
332
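/*
 * Copy a struct timeval from userspace and convert it into a socket
 * timeout in jiffies. A zero timeval means "no timeout"; negative
 * seconds are clamped to an immediate timeout with a rate-limited
 * warning.
 */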
333static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
334{
335	struct timeval tv;
336
337	if (optlen < sizeof(tv))
338		return -EINVAL;
339	if (copy_from_user(&tv, optval, sizeof(tv)))
340		return -EFAULT;
341	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
342		return -EDOM;
343
344	if (tv.tv_sec < 0) {
345		static int warned __read_mostly;
346
347		*timeo_p = 0;
348		if (warned < 10 && net_ratelimit()) {
349			warned++;
350			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
351				__func__, current->comm, task_pid_nr(current));
352		}
353		return 0;
354	}
355	*timeo_p = MAX_SCHEDULE_TIMEOUT;
356	if (tv.tv_sec == 0 && tv.tv_usec == 0)
357		return 0;
358	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
359		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
360	return 0;
361}
362
363static void sock_warn_obsolete_bsdism(const char *name)
364{
365	static int warned;
366	static char warncomm[TASK_COMM_LEN];
367	if (strcmp(warncomm, current->comm) && warned < 5) {
368		strcpy(warncomm,  current->comm);
369		pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
370			warncomm, name);
371		warned++;
372	}
373}
374
375#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
376
377static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
378{
379	if (sk->sk_flags & flags) {
380		sk->sk_flags &= ~flags;
381		if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP))
382			net_disable_timestamp();
383	}
384}
385
386
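/*
 * Queue an skb on sk_receive_queue after running the socket filter and
 * charging the receive buffer. Returns a negative errno when the filter
 * drops the skb or the rcvbuf/memory accounting limits would be
 * exceeded; on success the socket is notified via sk_data_ready().
 */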
387int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
388{
389	int err;
390	int skb_len;
391	unsigned long flags;
392	struct sk_buff_head *list = &sk->sk_receive_queue;
393
394	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
395		atomic_inc(&sk->sk_drops);
396		trace_sock_rcvqueue_full(sk, skb);
397		return -ENOMEM;
398	}
399
400	err = sk_filter(sk, skb);
401	if (err)
402		return err;
403
404	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
405		atomic_inc(&sk->sk_drops);
406		return -ENOBUFS;
407	}
408
409	skb->dev = NULL;
410	skb_set_owner_r(skb, sk);
411
412	/* Cache the SKB length before we tack it onto the receive
413	 * queue.  Once it is added it no longer belongs to us and
414	 * may be freed by other threads of control pulling packets
415	 * from the queue.
416	 */
417	skb_len = skb->len;
418
419	/* We escape from the RCU-protected region; make sure we don't leak
420	 * a non-refcounted dst.
421	 */
422	skb_dst_force(skb);
423
424	spin_lock_irqsave(&list->lock, flags);
425	skb->dropcount = atomic_read(&sk->sk_drops);
426	__skb_queue_tail(list, skb);
427	spin_unlock_irqrestore(&list->lock, flags);
428
429	if (!sock_flag(sk, SOCK_DEAD))
430		sk->sk_data_ready(sk, skb_len);
431	return 0;
432}
433EXPORT_SYMBOL(sock_queue_rcv_skb);
434
435int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
436{
437	int rc = NET_RX_SUCCESS;
438
439	if (sk_filter(sk, skb))
440		goto discard_and_relse;
441
442	skb->dev = NULL;
443
444	if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) {
445		atomic_inc(&sk->sk_drops);
446		goto discard_and_relse;
447	}
448	if (nested)
449		bh_lock_sock_nested(sk);
450	else
451		bh_lock_sock(sk);
452	if (!sock_owned_by_user(sk)) {
453		/*
454		 * trylock + unlock semantics:
455		 */
456		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
457
458		rc = sk_backlog_rcv(sk, skb);
459
460		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
461	} else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
462		bh_unlock_sock(sk);
463		atomic_inc(&sk->sk_drops);
464		goto discard_and_relse;
465	}
466
467	bh_unlock_sock(sk);
468out:
469	sock_put(sk);
470	return rc;
471discard_and_relse:
472	kfree_skb(skb);
473	goto out;
474}
475EXPORT_SYMBOL(sk_receive_skb);
476
477void sk_reset_txq(struct sock *sk)
478{
479	sk_tx_queue_clear(sk);
480}
481EXPORT_SYMBOL(sk_reset_txq);
482
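/*
 * Validate the socket's cached dst against @cookie. If the dst has
 * become obsolete, drop it from the cache and return NULL; otherwise
 * return it.
 */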
483struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
484{
485	struct dst_entry *dst = __sk_dst_get(sk);
486
487	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
488		sk_tx_queue_clear(sk);
489		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
490		dst_release(dst);
491		return NULL;
492	}
493
494	return dst;
495}
496EXPORT_SYMBOL(__sk_dst_check);
497
498struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
499{
500	struct dst_entry *dst = sk_dst_get(sk);
501
502	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
503		sk_dst_reset(sk);
504		dst_release(dst);
505		return NULL;
506	}
507
508	return dst;
509}
510EXPORT_SYMBOL(sk_dst_check);
511
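/*
 * SO_BINDTODEVICE: bind the socket to the interface named in @optval
 * (requires CAP_NET_RAW in the socket's network namespace). An empty
 * name, or a zero option length, removes the binding.
 */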
512static int sock_setbindtodevice(struct sock *sk, char __user *optval,
513				int optlen)
514{
515	int ret = -ENOPROTOOPT;
516#ifdef CONFIG_NETDEVICES
517	struct net *net = sock_net(sk);
518	char devname[IFNAMSIZ];
519	int index;
520
521	/* Sorry... */
522	ret = -EPERM;
523	if (!ns_capable(net->user_ns, CAP_NET_RAW))
524		goto out;
525
526	ret = -EINVAL;
527	if (optlen < 0)
528		goto out;
529
530	/* Bind this socket to a particular device like "eth0",
531	 * as specified in the passed interface name. If the
532	 * name is "" or the option length is zero the socket
533	 * is not bound.
534	 */
535	if (optlen > IFNAMSIZ - 1)
536		optlen = IFNAMSIZ - 1;
537	memset(devname, 0, sizeof(devname));
538
539	ret = -EFAULT;
540	if (copy_from_user(devname, optval, optlen))
541		goto out;
542
543	index = 0;
544	if (devname[0] != '\0') {
545		struct net_device *dev;
546
547		rcu_read_lock();
548		dev = dev_get_by_name_rcu(net, devname);
549		if (dev)
550			index = dev->ifindex;
551		rcu_read_unlock();
552		ret = -ENODEV;
553		if (!dev)
554			goto out;
555	}
556
557	lock_sock(sk);
558	sk->sk_bound_dev_if = index;
559	sk_dst_reset(sk);
560	release_sock(sk);
561
562	ret = 0;
563
564out:
565#endif
566
567	return ret;
568}
569
570static int sock_getbindtodevice(struct sock *sk, char __user *optval,
571				int __user *optlen, int len)
572{
573	int ret = -ENOPROTOOPT;
574#ifdef CONFIG_NETDEVICES
575	struct net *net = sock_net(sk);
576	struct net_device *dev;
577	char devname[IFNAMSIZ];
578	unsigned seq;
579
580	if (sk->sk_bound_dev_if == 0) {
581		len = 0;
582		goto zero;
583	}
584
585	ret = -EINVAL;
586	if (len < IFNAMSIZ)
587		goto out;
588
589retry:
590	seq = read_seqcount_begin(&devnet_rename_seq);
591	rcu_read_lock();
592	dev = dev_get_by_index_rcu(net, sk->sk_bound_dev_if);
593	ret = -ENODEV;
594	if (!dev) {
595		rcu_read_unlock();
596		goto out;
597	}
598
599	strcpy(devname, dev->name);
600	rcu_read_unlock();
601	if (read_seqcount_retry(&devnet_rename_seq, seq))
602		goto retry;
603
604	len = strlen(devname) + 1;
605
606	ret = -EFAULT;
607	if (copy_to_user(optval, devname, len))
608		goto out;
609
610zero:
611	ret = -EFAULT;
612	if (put_user(len, optlen))
613		goto out;
614
615	ret = 0;
616
617out:
618#endif
619
620	return ret;
621}
622
623static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
624{
625	if (valbool)
626		sock_set_flag(sk, bit);
627	else
628		sock_reset_flag(sk, bit);
629}
630
631/*
632 *	This is meant for all protocols to use and covers goings on
633 *	at the socket level. Everything here is generic.
634 */
635
636int sock_setsockopt(struct socket *sock, int level, int optname,
637		    char __user *optval, unsigned int optlen)
638{
639	struct sock *sk = sock->sk;
640	int val;
641	int valbool;
642	struct linger ling;
643	int ret = 0;
644
645	/*
646	 *	Options without arguments
647	 */
648
649	if (optname == SO_BINDTODEVICE)
650		return sock_setbindtodevice(sk, optval, optlen);
651
652	if (optlen < sizeof(int))
653		return -EINVAL;
654
655	if (get_user(val, (int __user *)optval))
656		return -EFAULT;
657
658	valbool = val ? 1 : 0;
659
660	lock_sock(sk);
661
662	switch (optname) {
663	case SO_DEBUG:
664		if (val && !capable(CAP_NET_ADMIN))
665			ret = -EACCES;
666		else
667			sock_valbool_flag(sk, SOCK_DBG, valbool);
668		break;
669	case SO_REUSEADDR:
670		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
671		break;
672	case SO_REUSEPORT:
673		sk->sk_reuseport = valbool;
674		break;
675	case SO_TYPE:
676	case SO_PROTOCOL:
677	case SO_DOMAIN:
678	case SO_ERROR:
679		ret = -ENOPROTOOPT;
680		break;
681	case SO_DONTROUTE:
682		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
683		break;
684	case SO_BROADCAST:
685		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
686		break;
687	case SO_SNDBUF:
688		/* Don't error on this; BSD doesn't, and if you think
689		 * about it this is right. Otherwise apps have to
690		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
691		 * are treated in BSD as hints.
692		 */
693		val = min_t(u32, val, sysctl_wmem_max);
694set_sndbuf:
695		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
696		sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF);
697		/* Wake up sending tasks if we upped the value. */
698		sk->sk_write_space(sk);
699		break;
700
701	case SO_SNDBUFFORCE:
702		if (!capable(CAP_NET_ADMIN)) {
703			ret = -EPERM;
704			break;
705		}
706		goto set_sndbuf;
707
708	case SO_RCVBUF:
709		/* Don't error on this; BSD doesn't, and if you think
710		 * about it this is right. Otherwise apps have to
711		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
712		 * are treated in BSD as hints.
713		 */
714		val = min_t(u32, val, sysctl_rmem_max);
715set_rcvbuf:
716		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
717		/*
718		 * We double it on the way in to account for
719		 * "struct sk_buff" etc. overhead.   Applications
720		 * assume that the SO_RCVBUF setting they make will
721		 * allow that much actual data to be received on that
722		 * socket.
723		 *
724		 * Applications are unaware that "struct sk_buff" and
725		 * other overheads allocate from the receive buffer
726		 * during socket buffer allocation.
727		 *
728		 * And after considering the possible alternatives,
729		 * returning the value we actually used in getsockopt
730		 * is the most desirable behavior.
731		 */
732		sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF);
733		break;
734
735	case SO_RCVBUFFORCE:
736		if (!capable(CAP_NET_ADMIN)) {
737			ret = -EPERM;
738			break;
739		}
740		goto set_rcvbuf;
741
742	case SO_KEEPALIVE:
743#ifdef CONFIG_INET
744		if (sk->sk_protocol == IPPROTO_TCP &&
745		    sk->sk_type == SOCK_STREAM)
746			tcp_set_keepalive(sk, valbool);
747#endif
748		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
749		break;
750
751	case SO_OOBINLINE:
752		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
753		break;
754
755	case SO_NO_CHECK:
756		sk->sk_no_check = valbool;
757		break;
758
759	case SO_PRIORITY:
760		if ((val >= 0 && val <= 6) ||
761		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
762			sk->sk_priority = val;
763		else
764			ret = -EPERM;
765		break;
766
767	case SO_LINGER:
768		if (optlen < sizeof(ling)) {
769			ret = -EINVAL;	/* 1003.1g */
770			break;
771		}
772		if (copy_from_user(&ling, optval, sizeof(ling))) {
773			ret = -EFAULT;
774			break;
775		}
776		if (!ling.l_onoff)
777			sock_reset_flag(sk, SOCK_LINGER);
778		else {
779#if (BITS_PER_LONG == 32)
780			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
781				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
782			else
783#endif
784				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
785			sock_set_flag(sk, SOCK_LINGER);
786		}
787		break;
788
789	case SO_BSDCOMPAT:
790		sock_warn_obsolete_bsdism("setsockopt");
791		break;
792
793	case SO_PASSCRED:
794		if (valbool)
795			set_bit(SOCK_PASSCRED, &sock->flags);
796		else
797			clear_bit(SOCK_PASSCRED, &sock->flags);
798		break;
799
800	case SO_TIMESTAMP:
801	case SO_TIMESTAMPNS:
802		if (valbool)  {
803			if (optname == SO_TIMESTAMP)
804				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
805			else
806				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
807			sock_set_flag(sk, SOCK_RCVTSTAMP);
808			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
809		} else {
810			sock_reset_flag(sk, SOCK_RCVTSTAMP);
811			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
812		}
813		break;
814
815	case SO_TIMESTAMPING:
816		if (val & ~SOF_TIMESTAMPING_MASK) {
817			ret = -EINVAL;
818			break;
819		}
820		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
821				  val & SOF_TIMESTAMPING_TX_HARDWARE);
822		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
823				  val & SOF_TIMESTAMPING_TX_SOFTWARE);
824		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
825				  val & SOF_TIMESTAMPING_RX_HARDWARE);
826		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
827			sock_enable_timestamp(sk,
828					      SOCK_TIMESTAMPING_RX_SOFTWARE);
829		else
830			sock_disable_timestamp(sk,
831					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
832		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
833				  val & SOF_TIMESTAMPING_SOFTWARE);
834		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
835				  val & SOF_TIMESTAMPING_SYS_HARDWARE);
836		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
837				  val & SOF_TIMESTAMPING_RAW_HARDWARE);
838		break;
839
840	case SO_RCVLOWAT:
841		if (val < 0)
842			val = INT_MAX;
843		sk->sk_rcvlowat = val ? : 1;
844		break;
845
846	case SO_RCVTIMEO:
847		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
848		break;
849
850	case SO_SNDTIMEO:
851		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
852		break;
853
854	case SO_ATTACH_FILTER:
855		ret = -EINVAL;
856		if (optlen == sizeof(struct sock_fprog)) {
857			struct sock_fprog fprog;
858
859			ret = -EFAULT;
860			if (copy_from_user(&fprog, optval, sizeof(fprog)))
861				break;
862
863			ret = sk_attach_filter(&fprog, sk);
864		}
865		break;
866
867	case SO_DETACH_FILTER:
868		ret = sk_detach_filter(sk);
869		break;
870
871	case SO_LOCK_FILTER:
872		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
873			ret = -EPERM;
874		else
875			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
876		break;
877
878	case SO_PASSSEC:
879		if (valbool)
880			set_bit(SOCK_PASSSEC, &sock->flags);
881		else
882			clear_bit(SOCK_PASSSEC, &sock->flags);
883		break;
884	case SO_MARK:
885		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
886			ret = -EPERM;
887		else
888			sk->sk_mark = val;
889		break;
890
891		/* We implement SO_SNDLOWAT etc. as not
892		   settable (1003.1g 5.3). */
893	case SO_RXQ_OVFL:
894		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
895		break;
896
897	case SO_WIFI_STATUS:
898		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
899		break;
900
901	case SO_PEEK_OFF:
902		if (sock->ops->set_peek_off)
903			sock->ops->set_peek_off(sk, val);
904		else
905			ret = -EOPNOTSUPP;
906		break;
907
908	case SO_NOFCS:
909		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
910		break;
911
912	case SO_SELECT_ERR_QUEUE:
913		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
914		break;
915
916#ifdef CONFIG_NET_LL_RX_POLL
917	case SO_LL:
918		/* allow unprivileged users to decrease the value */
919		if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
920			ret = -EPERM;
921		else {
922			if (val < 0)
923				ret = -EINVAL;
924			else
925				sk->sk_ll_usec = val;
926		}
927		break;
928#endif
929	default:
930		ret = -ENOPROTOOPT;
931		break;
932	}
933	release_sock(sk);
934	return ret;
935}
936EXPORT_SYMBOL(sock_setsockopt);
937
938
939void cred_to_ucred(struct pid *pid, const struct cred *cred,
940		   struct ucred *ucred)
941{
942	ucred->pid = pid_vnr(pid);
943	ucred->uid = ucred->gid = -1;
944	if (cred) {
945		struct user_namespace *current_ns = current_user_ns();
946
947		ucred->uid = from_kuid_munged(current_ns, cred->euid);
948		ucred->gid = from_kgid_munged(current_ns, cred->egid);
949	}
950}
951EXPORT_SYMBOL_GPL(cred_to_ucred);
952
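/*
 * Generic SOL_SOCKET getsockopt() handler: the counterpart of
 * sock_setsockopt(), reporting the socket-level options that are
 * common to all protocol families.
 */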
953int sock_getsockopt(struct socket *sock, int level, int optname,
954		    char __user *optval, int __user *optlen)
955{
956	struct sock *sk = sock->sk;
957
958	union {
959		int val;
960		struct linger ling;
961		struct timeval tm;
962	} v;
963
964	int lv = sizeof(int);
965	int len;
966
967	if (get_user(len, optlen))
968		return -EFAULT;
969	if (len < 0)
970		return -EINVAL;
971
972	memset(&v, 0, sizeof(v));
973
974	switch (optname) {
975	case SO_DEBUG:
976		v.val = sock_flag(sk, SOCK_DBG);
977		break;
978
979	case SO_DONTROUTE:
980		v.val = sock_flag(sk, SOCK_LOCALROUTE);
981		break;
982
983	case SO_BROADCAST:
984		v.val = sock_flag(sk, SOCK_BROADCAST);
985		break;
986
987	case SO_SNDBUF:
988		v.val = sk->sk_sndbuf;
989		break;
990
991	case SO_RCVBUF:
992		v.val = sk->sk_rcvbuf;
993		break;
994
995	case SO_REUSEADDR:
996		v.val = sk->sk_reuse;
997		break;
998
999	case SO_REUSEPORT:
1000		v.val = sk->sk_reuseport;
1001		break;
1002
1003	case SO_KEEPALIVE:
1004		v.val = sock_flag(sk, SOCK_KEEPOPEN);
1005		break;
1006
1007	case SO_TYPE:
1008		v.val = sk->sk_type;
1009		break;
1010
1011	case SO_PROTOCOL:
1012		v.val = sk->sk_protocol;
1013		break;
1014
1015	case SO_DOMAIN:
1016		v.val = sk->sk_family;
1017		break;
1018
1019	case SO_ERROR:
1020		v.val = -sock_error(sk);
1021		if (v.val == 0)
1022			v.val = xchg(&sk->sk_err_soft, 0);
1023		break;
1024
1025	case SO_OOBINLINE:
1026		v.val = sock_flag(sk, SOCK_URGINLINE);
1027		break;
1028
1029	case SO_NO_CHECK:
1030		v.val = sk->sk_no_check;
1031		break;
1032
1033	case SO_PRIORITY:
1034		v.val = sk->sk_priority;
1035		break;
1036
1037	case SO_LINGER:
1038		lv		= sizeof(v.ling);
1039		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
1040		v.ling.l_linger	= sk->sk_lingertime / HZ;
1041		break;
1042
1043	case SO_BSDCOMPAT:
1044		sock_warn_obsolete_bsdism("getsockopt");
1045		break;
1046
1047	case SO_TIMESTAMP:
1048		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1049				!sock_flag(sk, SOCK_RCVTSTAMPNS);
1050		break;
1051
1052	case SO_TIMESTAMPNS:
1053		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1054		break;
1055
1056	case SO_TIMESTAMPING:
1057		v.val = 0;
1058		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
1059			v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
1060		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
1061			v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
1062		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
1063			v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
1064		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
1065			v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
1066		if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
1067			v.val |= SOF_TIMESTAMPING_SOFTWARE;
1068		if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
1069			v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
1070		if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
1071			v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
1072		break;
1073
1074	case SO_RCVTIMEO:
1075		lv = sizeof(struct timeval);
1076		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1077			v.tm.tv_sec = 0;
1078			v.tm.tv_usec = 0;
1079		} else {
1080			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1081			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
1082		}
1083		break;
1084
1085	case SO_SNDTIMEO:
1086		lv = sizeof(struct timeval);
1087		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1088			v.tm.tv_sec = 0;
1089			v.tm.tv_usec = 0;
1090		} else {
1091			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1092			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
1093		}
1094		break;
1095
1096	case SO_RCVLOWAT:
1097		v.val = sk->sk_rcvlowat;
1098		break;
1099
1100	case SO_SNDLOWAT:
1101		v.val = 1;
1102		break;
1103
1104	case SO_PASSCRED:
1105		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1106		break;
1107
1108	case SO_PEERCRED:
1109	{
1110		struct ucred peercred;
1111		if (len > sizeof(peercred))
1112			len = sizeof(peercred);
1113		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1114		if (copy_to_user(optval, &peercred, len))
1115			return -EFAULT;
1116		goto lenout;
1117	}
1118
1119	case SO_PEERNAME:
1120	{
1121		char address[128];
1122
1123		if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
1124			return -ENOTCONN;
1125		if (lv < len)
1126			return -EINVAL;
1127		if (copy_to_user(optval, address, len))
1128			return -EFAULT;
1129		goto lenout;
1130	}
1131
1132	/* Dubious BSD thing... Probably nobody even uses it, but
1133	 * the UNIX standard wants it for whatever reason... -DaveM
1134	 */
1135	case SO_ACCEPTCONN:
1136		v.val = sk->sk_state == TCP_LISTEN;
1137		break;
1138
1139	case SO_PASSSEC:
1140		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1141		break;
1142
1143	case SO_PEERSEC:
1144		return security_socket_getpeersec_stream(sock, optval, optlen, len);
1145
1146	case SO_MARK:
1147		v.val = sk->sk_mark;
1148		break;
1149
1150	case SO_RXQ_OVFL:
1151		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1152		break;
1153
1154	case SO_WIFI_STATUS:
1155		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1156		break;
1157
1158	case SO_PEEK_OFF:
1159		if (!sock->ops->set_peek_off)
1160			return -EOPNOTSUPP;
1161
1162		v.val = sk->sk_peek_off;
1163		break;
1164	case SO_NOFCS:
1165		v.val = sock_flag(sk, SOCK_NOFCS);
1166		break;
1167
1168	case SO_BINDTODEVICE:
1169		return sock_getbindtodevice(sk, optval, optlen, len);
1170
1171	case SO_GET_FILTER:
1172		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1173		if (len < 0)
1174			return len;
1175
1176		goto lenout;
1177
1178	case SO_LOCK_FILTER:
1179		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1180		break;
1181
1182	case SO_SELECT_ERR_QUEUE:
1183		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1184		break;
1185
1186#ifdef CONFIG_NET_LL_RX_POLL
1187	case SO_LL:
1188		v.val = sk->sk_ll_usec;
1189		break;
1190#endif
1191
1192	default:
1193		return -ENOPROTOOPT;
1194	}
1195
1196	if (len > lv)
1197		len = lv;
1198	if (copy_to_user(optval, &v, len))
1199		return -EFAULT;
1200lenout:
1201	if (put_user(len, optlen))
1202		return -EFAULT;
1203	return 0;
1204}
1205
1206/*
1207 * Initialize an sk_lock.
1208 *
1209 * (We also register the sk_lock with the lock validator.)
1210 */
1211static inline void sock_lock_init(struct sock *sk)
1212{
1213	sock_lock_init_class_and_name(sk,
1214			af_family_slock_key_strings[sk->sk_family],
1215			af_family_slock_keys + sk->sk_family,
1216			af_family_key_strings[sk->sk_family],
1217			af_family_keys + sk->sk_family);
1218}
1219
1220/*
1221 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1222 * even temporarily, because of RCU lookups. sk_node should also be left as is.
1223 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end.
1224 */
1225static void sock_copy(struct sock *nsk, const struct sock *osk)
1226{
1227#ifdef CONFIG_SECURITY_NETWORK
1228	void *sptr = nsk->sk_security;
1229#endif
1230	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1231
1232	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1233	       osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1234
1235#ifdef CONFIG_SECURITY_NETWORK
1236	nsk->sk_security = sptr;
1237	security_sk_clone(osk, nsk);
1238#endif
1239}
1240
1241void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
1242{
1243	unsigned long nulls1, nulls2;
1244
1245	nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
1246	nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
1247	if (nulls1 > nulls2)
1248		swap(nulls1, nulls2);
1249
1250	if (nulls1 != 0)
1251		memset((char *)sk, 0, nulls1);
1252	memset((char *)sk + nulls1 + sizeof(void *), 0,
1253	       nulls2 - nulls1 - sizeof(void *));
1254	memset((char *)sk + nulls2 + sizeof(void *), 0,
1255	       size - nulls2 - sizeof(void *));
1256}
1257EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
1258
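/*
 * Allocate a struct sock from the protocol's slab cache (or by kmalloc
 * when no cache is registered), run the security hook and take a
 * reference on the protocol module. Returns NULL on failure.
 */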
1259static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1260		int family)
1261{
1262	struct sock *sk;
1263	struct kmem_cache *slab;
1264
1265	slab = prot->slab;
1266	if (slab != NULL) {
1267		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1268		if (!sk)
1269			return sk;
1270		if (priority & __GFP_ZERO) {
1271			if (prot->clear_sk)
1272				prot->clear_sk(sk, prot->obj_size);
1273			else
1274				sk_prot_clear_nulls(sk, prot->obj_size);
1275		}
1276	} else
1277		sk = kmalloc(prot->obj_size, priority);
1278
1279	if (sk != NULL) {
1280		kmemcheck_annotate_bitfield(sk, flags);
1281
1282		if (security_sk_alloc(sk, family, priority))
1283			goto out_free;
1284
1285		if (!try_module_get(prot->owner))
1286			goto out_free_sec;
1287		sk_tx_queue_clear(sk);
1288	}
1289
1290	return sk;
1291
1292out_free_sec:
1293	security_sk_free(sk);
1294out_free:
1295	if (slab != NULL)
1296		kmem_cache_free(slab, sk);
1297	else
1298		kfree(sk);
1299	return NULL;
1300}
1301
1302static void sk_prot_free(struct proto *prot, struct sock *sk)
1303{
1304	struct kmem_cache *slab;
1305	struct module *owner;
1306
1307	owner = prot->owner;
1308	slab = prot->slab;
1309
1310	security_sk_free(sk);
1311	if (slab != NULL)
1312		kmem_cache_free(slab, sk);
1313	else
1314		kfree(sk);
1315	module_put(owner);
1316}
1317
1318#if IS_ENABLED(CONFIG_NET_CLS_CGROUP)
1319void sock_update_classid(struct sock *sk)
1320{
1321	u32 classid;
1322
1323	classid = task_cls_classid(current);
1324	if (classid != sk->sk_classid)
1325		sk->sk_classid = classid;
1326}
1327EXPORT_SYMBOL(sock_update_classid);
1328#endif
1329
1330#if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
1331void sock_update_netprioidx(struct sock *sk)
1332{
1333	if (in_interrupt())
1334		return;
1335
1336	sk->sk_cgrp_prioidx = task_netprioidx(current);
1337}
1338EXPORT_SYMBOL_GPL(sock_update_netprioidx);
1339#endif
1340
1341/**
1342 *	sk_alloc - All socket objects are allocated here
1343 *	@net: the applicable net namespace
1344 *	@family: protocol family
1345 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1346 *	@prot: struct proto associated with this new sock instance
1347 */
1348struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1349		      struct proto *prot)
1350{
1351	struct sock *sk;
1352
1353	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1354	if (sk) {
1355		sk->sk_family = family;
1356		/*
1357		 * See comment in struct sock definition to understand
1358		 * why we need sk_prot_creator -acme
1359		 */
1360		sk->sk_prot = sk->sk_prot_creator = prot;
1361		sock_lock_init(sk);
1362		sock_net_set(sk, get_net(net));
1363		atomic_set(&sk->sk_wmem_alloc, 1);
1364
1365		sock_update_classid(sk);
1366		sock_update_netprioidx(sk);
1367	}
1368
1369	return sk;
1370}
1371EXPORT_SYMBOL(sk_alloc);
1372
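/*
 * Final destruction of a socket, once its last sk_wmem_alloc reference
 * is gone: run sk_destruct, release the attached filter, timestamping
 * and peer credentials, drop the netns reference and free the memory
 * via sk_prot_free().
 */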
1373static void __sk_free(struct sock *sk)
1374{
1375	struct sk_filter *filter;
1376
1377	if (sk->sk_destruct)
1378		sk->sk_destruct(sk);
1379
1380	filter = rcu_dereference_check(sk->sk_filter,
1381				       atomic_read(&sk->sk_wmem_alloc) == 0);
1382	if (filter) {
1383		sk_filter_uncharge(sk, filter);
1384		RCU_INIT_POINTER(sk->sk_filter, NULL);
1385	}
1386
1387	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1388
1389	if (atomic_read(&sk->sk_omem_alloc))
1390		pr_debug("%s: optmem leakage (%d bytes) detected\n",
1391			 __func__, atomic_read(&sk->sk_omem_alloc));
1392
1393	if (sk->sk_peer_cred)
1394		put_cred(sk->sk_peer_cred);
1395	put_pid(sk->sk_peer_pid);
1396	put_net(sock_net(sk));
1397	sk_prot_free(sk->sk_prot_creator, sk);
1398}
1399
1400void sk_free(struct sock *sk)
1401{
1402	/*
1403	 * We subtract one from sk_wmem_alloc so we can tell whether
1404	 * some packets are still in some tx queue.
1405	 * If it is non-zero, sock_wfree() will call __sk_free(sk) later.
1406	 */
1407	if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1408		__sk_free(sk);
1409}
1410EXPORT_SYMBOL(sk_free);
1411
1412/*
1413 * The last sock_put should drop the reference to sk->sk_net. It has already
1414 * been dropped in sk_change_net. Taking a reference to the stopping namespace
1415 * is not an option.
1416 * Instead, take a reference to the socket to remove it from the hash while it
1417 * is still _alive_, and after that destroy it in the context of init_net.
1418 */
1419void sk_release_kernel(struct sock *sk)
1420{
1421	if (sk == NULL || sk->sk_socket == NULL)
1422		return;
1423
1424	sock_hold(sk);
1425	sock_release(sk->sk_socket);
1426	release_net(sock_net(sk));
1427	sock_net_set(sk, get_net(&init_net));
1428	sock_put(sk);
1429}
1430EXPORT_SYMBOL(sk_release_kernel);
1431
1432static void sk_update_clone(const struct sock *sk, struct sock *newsk)
1433{
1434	if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
1435		sock_update_memcg(newsk);
1436}
1437
1438/**
1439 *	sk_clone_lock - clone a socket, and lock its clone
1440 *	@sk: the socket to clone
1441 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1442 *
1443 *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1444 */
1445struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1446{
1447	struct sock *newsk;
1448
1449	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1450	if (newsk != NULL) {
1451		struct sk_filter *filter;
1452
1453		sock_copy(newsk, sk);
1454
1455		/* SANITY */
1456		get_net(sock_net(newsk));
1457		sk_node_init(&newsk->sk_node);
1458		sock_lock_init(newsk);
1459		bh_lock_sock(newsk);
1460		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
1461		newsk->sk_backlog.len = 0;
1462
1463		atomic_set(&newsk->sk_rmem_alloc, 0);
1464		/*
1465		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1466		 */
1467		atomic_set(&newsk->sk_wmem_alloc, 1);
1468		atomic_set(&newsk->sk_omem_alloc, 0);
1469		skb_queue_head_init(&newsk->sk_receive_queue);
1470		skb_queue_head_init(&newsk->sk_write_queue);
1471#ifdef CONFIG_NET_DMA
1472		skb_queue_head_init(&newsk->sk_async_wait_queue);
1473#endif
1474
1475		spin_lock_init(&newsk->sk_dst_lock);
1476		rwlock_init(&newsk->sk_callback_lock);
1477		lockdep_set_class_and_name(&newsk->sk_callback_lock,
1478				af_callback_keys + newsk->sk_family,
1479				af_family_clock_key_strings[newsk->sk_family]);
1480
1481		newsk->sk_dst_cache	= NULL;
1482		newsk->sk_wmem_queued	= 0;
1483		newsk->sk_forward_alloc = 0;
1484		newsk->sk_send_head	= NULL;
1485		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1486
1487		sock_reset_flag(newsk, SOCK_DONE);
1488		skb_queue_head_init(&newsk->sk_error_queue);
1489
1490		filter = rcu_dereference_protected(newsk->sk_filter, 1);
1491		if (filter != NULL)
1492			sk_filter_charge(newsk, filter);
1493
1494		if (unlikely(xfrm_sk_clone_policy(newsk))) {
1495			/* It is still a raw copy of the parent, so invalidate
1496			 * the destructor and do a plain sk_free() */
1497			newsk->sk_destruct = NULL;
1498			bh_unlock_sock(newsk);
1499			sk_free(newsk);
1500			newsk = NULL;
1501			goto out;
1502		}
1503
1504		newsk->sk_err	   = 0;
1505		newsk->sk_priority = 0;
1506		/*
1507		 * Before updating sk_refcnt, we must commit prior changes to memory
1508		 * (Documentation/RCU/rculist_nulls.txt for details)
1509		 */
1510		smp_wmb();
1511		atomic_set(&newsk->sk_refcnt, 2);
1512
1513		/*
1514		 * Increment the counter in the same struct proto as the master
1515		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1516		 * is the same as sk->sk_prot->socks, as this field was copied
1517		 * with memcpy).
1518		 *
1519		 * This _changes_ the previous behaviour, where
1520		 * tcp_create_openreq_child always incremented the
1521		 * equivalent of tcp_prot->socks (inet_sock_nr), so this has
1522		 * to be taken into account in all callers. -acme
1523		 */
1524		sk_refcnt_debug_inc(newsk);
1525		sk_set_socket(newsk, NULL);
1526		newsk->sk_wq = NULL;
1527
1528		sk_update_clone(sk, newsk);
1529
1530		if (newsk->sk_prot->sockets_allocated)
1531			sk_sockets_allocated_inc(newsk);
1532
1533		if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1534			net_enable_timestamp();
1535	}
1536out:
1537	return newsk;
1538}
1539EXPORT_SYMBOL_GPL(sk_clone_lock);
1540
1541void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1542{
1543	__sk_dst_set(sk, dst);
1544	sk->sk_route_caps = dst->dev->features;
1545	if (sk->sk_route_caps & NETIF_F_GSO)
1546		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1547	sk->sk_route_caps &= ~sk->sk_route_nocaps;
1548	if (sk_can_gso(sk)) {
1549		if (dst->header_len) {
1550			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1551		} else {
1552			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1553			sk->sk_gso_max_size = dst->dev->gso_max_size;
1554			sk->sk_gso_max_segs = dst->dev->gso_max_segs;
1555		}
1556	}
1557}
1558EXPORT_SYMBOL_GPL(sk_setup_caps);
1559
1560/*
1561 *	Simple resource managers for sockets.
1562 */
1563
1564
1565/*
1566 * Write buffer destructor automatically called from kfree_skb.
1567 */
1568void sock_wfree(struct sk_buff *skb)
1569{
1570	struct sock *sk = skb->sk;
1571	unsigned int len = skb->truesize;
1572
1573	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1574		/*
1575		 * Keep a reference on sk_wmem_alloc; it will be released
1576		 * after the sk_write_space() call.
1577		 */
1578		atomic_sub(len - 1, &sk->sk_wmem_alloc);
1579		sk->sk_write_space(sk);
1580		len = 1;
1581	}
1582	/*
1583	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1584	 * could not do because of in-flight packets
1585	 */
1586	if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1587		__sk_free(sk);
1588}
1589EXPORT_SYMBOL(sock_wfree);
1590
1591/*
1592 * Read buffer destructor automatically called from kfree_skb.
1593 */
1594void sock_rfree(struct sk_buff *skb)
1595{
1596	struct sock *sk = skb->sk;
1597	unsigned int len = skb->truesize;
1598
1599	atomic_sub(len, &sk->sk_rmem_alloc);
1600	sk_mem_uncharge(sk, len);
1601}
1602EXPORT_SYMBOL(sock_rfree);
1603
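/*
 * skb destructor used for early-demuxed packets: drop the socket
 * reference taken when the skb was attached to @sk (timewait sockets
 * need inet_twsk_put() instead of sock_put()).
 */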
1604void sock_edemux(struct sk_buff *skb)
1605{
1606	struct sock *sk = skb->sk;
1607
1608#ifdef CONFIG_INET
1609	if (sk->sk_state == TCP_TIME_WAIT)
1610		inet_twsk_put(inet_twsk(sk));
1611	else
1612#endif
1613		sock_put(sk);
1614}
1615EXPORT_SYMBOL(sock_edemux);
1616
1617kuid_t sock_i_uid(struct sock *sk)
1618{
1619	kuid_t uid;
1620
1621	read_lock_bh(&sk->sk_callback_lock);
1622	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1623	read_unlock_bh(&sk->sk_callback_lock);
1624	return uid;
1625}
1626EXPORT_SYMBOL(sock_i_uid);
1627
1628unsigned long sock_i_ino(struct sock *sk)
1629{
1630	unsigned long ino;
1631
1632	read_lock_bh(&sk->sk_callback_lock);
1633	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1634	read_unlock_bh(&sk->sk_callback_lock);
1635	return ino;
1636}
1637EXPORT_SYMBOL(sock_i_ino);
1638
1639/*
1640 * Allocate a skb from the socket's send buffer.
1641 */
1642struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1643			     gfp_t priority)
1644{
1645	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1646		struct sk_buff *skb = alloc_skb(size, priority);
1647		if (skb) {
1648			skb_set_owner_w(skb, sk);
1649			return skb;
1650		}
1651	}
1652	return NULL;
1653}
1654EXPORT_SYMBOL(sock_wmalloc);
1655
1656/*
1657 * Allocate a skb from the socket's receive buffer.
1658 */
1659struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
1660			     gfp_t priority)
1661{
1662	if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1663		struct sk_buff *skb = alloc_skb(size, priority);
1664		if (skb) {
1665			skb_set_owner_r(skb, sk);
1666			return skb;
1667		}
1668	}
1669	return NULL;
1670}
1671
1672/*
1673 * Allocate a memory block from the socket's option memory buffer.
1674 */
1675void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1676{
1677	if ((unsigned int)size <= sysctl_optmem_max &&
1678	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1679		void *mem;
1680		/* First do the add, to avoid the race if kmalloc
1681		 * might sleep.
1682		 */
1683		atomic_add(size, &sk->sk_omem_alloc);
1684		mem = kmalloc(size, priority);
1685		if (mem)
1686			return mem;
1687		atomic_sub(size, &sk->sk_omem_alloc);
1688	}
1689	return NULL;
1690}
1691EXPORT_SYMBOL(sock_kmalloc);
1692
1693/*
1694 * Free an option memory block.
1695 */
1696void sock_kfree_s(struct sock *sk, void *mem, int size)
1697{
1698	kfree(mem);
1699	atomic_sub(size, &sk->sk_omem_alloc);
1700}
1701EXPORT_SYMBOL(sock_kfree_s);
1702
1703/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1704   I think these locks should be removed for datagram sockets.
1705 */
1706static long sock_wait_for_wmem(struct sock *sk, long timeo)
1707{
1708	DEFINE_WAIT(wait);
1709
1710	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1711	for (;;) {
1712		if (!timeo)
1713			break;
1714		if (signal_pending(current))
1715			break;
1716		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1717		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1718		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1719			break;
1720		if (sk->sk_shutdown & SEND_SHUTDOWN)
1721			break;
1722		if (sk->sk_err)
1723			break;
1724		timeo = schedule_timeout(timeo);
1725	}
1726	finish_wait(sk_sleep(sk), &wait);
1727	return timeo;
1728}
1729
1730
1731/*
1732 *	Generic send/receive buffer handlers
1733 */
1734
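/*
 * Allocate an skb with @header_len bytes of linear space plus @data_len
 * bytes in page fragments, charged to the socket's send buffer. Unless
 * @noblock is set, this blocks until sndbuf space becomes available;
 * errors are returned through @errcode.
 */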
1735struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1736				     unsigned long data_len, int noblock,
1737				     int *errcode)
1738{
1739	struct sk_buff *skb;
1740	gfp_t gfp_mask;
1741	long timeo;
1742	int err;
1743	int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1744
1745	err = -EMSGSIZE;
1746	if (npages > MAX_SKB_FRAGS)
1747		goto failure;
1748
1749	gfp_mask = sk->sk_allocation;
1750	if (gfp_mask & __GFP_WAIT)
1751		gfp_mask |= __GFP_REPEAT;
1752
1753	timeo = sock_sndtimeo(sk, noblock);
1754	while (1) {
1755		err = sock_error(sk);
1756		if (err != 0)
1757			goto failure;
1758
1759		err = -EPIPE;
1760		if (sk->sk_shutdown & SEND_SHUTDOWN)
1761			goto failure;
1762
1763		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1764			skb = alloc_skb(header_len, gfp_mask);
1765			if (skb) {
1766				int i;
1767
1768				/* No pages, we're done... */
1769				if (!data_len)
1770					break;
1771
1772				skb->truesize += data_len;
1773				skb_shinfo(skb)->nr_frags = npages;
1774				for (i = 0; i < npages; i++) {
1775					struct page *page;
1776
1777					page = alloc_pages(sk->sk_allocation, 0);
1778					if (!page) {
1779						err = -ENOBUFS;
1780						skb_shinfo(skb)->nr_frags = i;
1781						kfree_skb(skb);
1782						goto failure;
1783					}
1784
1785					__skb_fill_page_desc(skb, i,
1786							page, 0,
1787							(data_len >= PAGE_SIZE ?
1788							 PAGE_SIZE :
1789							 data_len));
1790					data_len -= PAGE_SIZE;
1791				}
1792
1793				/* Full success... */
1794				break;
1795			}
1796			err = -ENOBUFS;
1797			goto failure;
1798		}
1799		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1800		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1801		err = -EAGAIN;
1802		if (!timeo)
1803			goto failure;
1804		if (signal_pending(current))
1805			goto interrupted;
1806		timeo = sock_wait_for_wmem(sk, timeo);
1807	}
1808
1809	skb_set_owner_w(skb, sk);
1810	return skb;
1811
1812interrupted:
1813	err = sock_intr_errno(timeo);
1814failure:
1815	*errcode = err;
1816	return NULL;
1817}
1818EXPORT_SYMBOL(sock_alloc_send_pskb);
1819
1820struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1821				    int noblock, int *errcode)
1822{
1823	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
1824}
1825EXPORT_SYMBOL(sock_alloc_send_skb);
1826
1827/* On 32bit arches, an skb frag is limited to 2^15 */
1828#define SKB_FRAG_PAGE_ORDER	get_order(32768)
1829
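/*
 * Make sure @pfrag has usable space, reusing the current page when
 * possible and otherwise allocating a fresh (possibly high-order) page.
 * Returns false, enters memory pressure and moderates the sndbuf when
 * the allocation fails.
 */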
1830bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
1831{
1832	int order;
1833
1834	if (pfrag->page) {
1835		if (atomic_read(&pfrag->page->_count) == 1) {
1836			pfrag->offset = 0;
1837			return true;
1838		}
1839		if (pfrag->offset < pfrag->size)
1840			return true;
1841		put_page(pfrag->page);
1842	}
1843
1844	/* We restrict high order allocations to users that can afford to wait */
1845	order = (sk->sk_allocation & __GFP_WAIT) ? SKB_FRAG_PAGE_ORDER : 0;
1846
1847	do {
1848		gfp_t gfp = sk->sk_allocation;
1849
1850		if (order)
1851			gfp |= __GFP_COMP | __GFP_NOWARN;
1852		pfrag->page = alloc_pages(gfp, order);
1853		if (likely(pfrag->page)) {
1854			pfrag->offset = 0;
1855			pfrag->size = PAGE_SIZE << order;
1856			return true;
1857		}
1858	} while (--order >= 0);
1859
1860	sk_enter_memory_pressure(sk);
1861	sk_stream_moderate_sndbuf(sk);
1862	return false;
1863}
1864EXPORT_SYMBOL(sk_page_frag_refill);
1865
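/*
 * Slow path of lock_sock(): sleep until the current owner releases the
 * socket. Called with sk_lock.slock held; it is dropped and re-acquired
 * around schedule().
 */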
1866static void __lock_sock(struct sock *sk)
1867	__releases(&sk->sk_lock.slock)
1868	__acquires(&sk->sk_lock.slock)
1869{
1870	DEFINE_WAIT(wait);
1871
1872	for (;;) {
1873		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1874					TASK_UNINTERRUPTIBLE);
1875		spin_unlock_bh(&sk->sk_lock.slock);
1876		schedule();
1877		spin_lock_bh(&sk->sk_lock.slock);
1878		if (!sock_owned_by_user(sk))
1879			break;
1880	}
1881	finish_wait(&sk->sk_lock.wq, &wait);
1882}
1883
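/*
 * Process every skb queued on the socket backlog while the lock was
 * owned by user context. Runs in process context with the backlog taken
 * private, re-checking for new arrivals until the queue stays empty.
 */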
1884static void __release_sock(struct sock *sk)
1885	__releases(&sk->sk_lock.slock)
1886	__acquires(&sk->sk_lock.slock)
1887{
1888	struct sk_buff *skb = sk->sk_backlog.head;
1889
1890	do {
1891		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1892		bh_unlock_sock(sk);
1893
1894		do {
1895			struct sk_buff *next = skb->next;
1896
1897			prefetch(next);
1898			WARN_ON_ONCE(skb_dst_is_noref(skb));
1899			skb->next = NULL;
1900			sk_backlog_rcv(sk, skb);
1901
1902			/*
1903			 * We are in process context here with softirqs
1904			 * disabled, use cond_resched_softirq() to preempt.
1905			 * This is safe to do because we've taken the backlog
1906			 * queue private:
1907			 */
1908			cond_resched_softirq();
1909
1910			skb = next;
1911		} while (skb != NULL);
1912
1913		bh_lock_sock(sk);
1914	} while ((skb = sk->sk_backlog.head) != NULL);
1915
1916	/*
1917	 * Doing the zeroing here guarantees we cannot loop forever
1918	 * while a wild producer attempts to flood us.
1919	 */
1920	sk->sk_backlog.len = 0;
1921}
1922
1923/**
1924 * sk_wait_data - wait for data to arrive at sk_receive_queue
1925 * @sk:    sock to wait on
1926 * @timeo: for how long
1927 *
1928 * Socket state, including sk->sk_err, is now changed only under the socket
1929 * lock, hence we may omit checks after joining the wait queue.
1930 * We check the receive queue before schedule() only as an optimization;
1931 * it is very likely that release_sock() added new data.
1932 */
1933int sk_wait_data(struct sock *sk, long *timeo)
1934{
1935	int rc;
1936	DEFINE_WAIT(wait);
1937
1938	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1939	set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1940	rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1941	clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1942	finish_wait(sk_sleep(sk), &wait);
1943	return rc;
1944}
1945EXPORT_SYMBOL(sk_wait_data);
1946
1947/**
1948 *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
1949 *	@sk: socket
1950 *	@size: memory size to allocate
1951 *	@kind: allocation type
1952 *
1953 *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
1954 *	rmem allocation. This function assumes that protocols which have
1955 *	memory_pressure use sk_wmem_queued as write buffer accounting.
1956 */
1957int __sk_mem_schedule(struct sock *sk, int size, int kind)
1958{
1959	struct proto *prot = sk->sk_prot;
1960	int amt = sk_mem_pages(size);
1961	long allocated;
1962	int parent_status = UNDER_LIMIT;
1963
1964	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
1965
1966	allocated = sk_memory_allocated_add(sk, amt, &parent_status);
1967
1968	/* Under limit. */
1969	if (parent_status == UNDER_LIMIT &&
1970			allocated <= sk_prot_mem_limits(sk, 0)) {
1971		sk_leave_memory_pressure(sk);
1972		return 1;
1973	}
1974
1975	/* Under pressure. (we or our parents) */
1976	if ((parent_status > SOFT_LIMIT) ||
1977			allocated > sk_prot_mem_limits(sk, 1))
1978		sk_enter_memory_pressure(sk);
1979
1980	/* Over hard limit (we or our parents) */
1981	if ((parent_status == OVER_LIMIT) ||
1982			(allocated > sk_prot_mem_limits(sk, 2)))
1983		goto suppress_allocation;
1984
1985	/* guarantee minimum buffer size under pressure */
1986	if (kind == SK_MEM_RECV) {
1987		if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
1988			return 1;
1989
1990	} else { /* SK_MEM_SEND */
1991		if (sk->sk_type == SOCK_STREAM) {
1992			if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
1993				return 1;
1994		} else if (atomic_read(&sk->sk_wmem_alloc) <
1995			   prot->sysctl_wmem[0])
1996				return 1;
1997	}
1998
1999	if (sk_has_memory_pressure(sk)) {
2000		int alloc;
2001
2002		if (!sk_under_memory_pressure(sk))
2003			return 1;
2004		alloc = sk_sockets_allocated_read_positive(sk);
2005		if (sk_prot_mem_limits(sk, 2) > alloc *
2006		    sk_mem_pages(sk->sk_wmem_queued +
2007				 atomic_read(&sk->sk_rmem_alloc) +
2008				 sk->sk_forward_alloc))
2009			return 1;
2010	}
2011
2012suppress_allocation:
2013
2014	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2015		sk_stream_moderate_sndbuf(sk);
2016
2017		/* Fail only if socket is _under_ its sndbuf.
2018		 * In this case we cannot block, so we have to fail.
2019		 */
2020		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2021			return 1;
2022	}
2023
2024	trace_sock_exceed_buf_limit(sk, prot, allocated);
2025
2026	/* Alas. Undo changes. */
2027	sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
2028
2029	sk_memory_allocated_sub(sk, amt);
2030
2031	return 0;
2032}
2033EXPORT_SYMBOL(__sk_mem_schedule);
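
/*
 * Protocols normally reach __sk_mem_schedule() via the sk_wmem_schedule()
 * and sk_rmem_schedule() helpers in include/net/sock.h.  A minimal
 * send-side sketch (my_proto_charge_send is a hypothetical name):
 */
static bool my_proto_charge_send(struct sock *sk, struct sk_buff *skb)
{
	if (!sk_wmem_schedule(sk, skb->truesize))
		return false;			/* over the protocol's memory limits */
	sk_mem_charge(sk, skb->truesize);	/* consume part of sk_forward_alloc */
	return true;
}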
2034
2035/**
2036 *	__sk_mem_reclaim - reclaim memory_allocated
2037 *	@sk: socket
2038 */
2039void __sk_mem_reclaim(struct sock *sk)
2040{
2041	sk_memory_allocated_sub(sk,
2042				sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT);
2043	sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
2044
2045	if (sk_under_memory_pressure(sk) &&
2046	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2047		sk_leave_memory_pressure(sk);
2048}
2049EXPORT_SYMBOL(__sk_mem_reclaim);
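
/*
 * Receive-side counterpart (hypothetical helper): once an skb has been
 * consumed, uncharging grows sk_forward_alloc back and sk_mem_reclaim()
 * returns any surplus quanta to the protocol's global counter by calling
 * __sk_mem_reclaim() above.
 */
static void my_proto_uncharge_recv(struct sock *sk, struct sk_buff *skb)
{
	sk_mem_uncharge(sk, skb->truesize);
	sk_mem_reclaim(sk);	/* calls __sk_mem_reclaim() once >= one quantum */
}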
2050
2051
2052/*
2053 * Set of default routines for initialising struct proto_ops when
2054 * the protocol does not support a particular function. In certain
2055 * cases where it makes no sense for a protocol to have a "do nothing"
2056 * function, some default processing is provided.
2057 */
2058
2059int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2060{
2061	return -EOPNOTSUPP;
2062}
2063EXPORT_SYMBOL(sock_no_bind);
2064
2065int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2066		    int len, int flags)
2067{
2068	return -EOPNOTSUPP;
2069}
2070EXPORT_SYMBOL(sock_no_connect);
2071
2072int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2073{
2074	return -EOPNOTSUPP;
2075}
2076EXPORT_SYMBOL(sock_no_socketpair);
2077
2078int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
2079{
2080	return -EOPNOTSUPP;
2081}
2082EXPORT_SYMBOL(sock_no_accept);
2083
2084int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2085		    int *len, int peer)
2086{
2087	return -EOPNOTSUPP;
2088}
2089EXPORT_SYMBOL(sock_no_getname);
2090
2091unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
2092{
2093	return 0;
2094}
2095EXPORT_SYMBOL(sock_no_poll);
2096
2097int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2098{
2099	return -EOPNOTSUPP;
2100}
2101EXPORT_SYMBOL(sock_no_ioctl);
2102
2103int sock_no_listen(struct socket *sock, int backlog)
2104{
2105	return -EOPNOTSUPP;
2106}
2107EXPORT_SYMBOL(sock_no_listen);
2108
2109int sock_no_shutdown(struct socket *sock, int how)
2110{
2111	return -EOPNOTSUPP;
2112}
2113EXPORT_SYMBOL(sock_no_shutdown);
2114
2115int sock_no_setsockopt(struct socket *sock, int level, int optname,
2116		    char __user *optval, unsigned int optlen)
2117{
2118	return -EOPNOTSUPP;
2119}
2120EXPORT_SYMBOL(sock_no_setsockopt);
2121
2122int sock_no_getsockopt(struct socket *sock, int level, int optname,
2123		    char __user *optval, int __user *optlen)
2124{
2125	return -EOPNOTSUPP;
2126}
2127EXPORT_SYMBOL(sock_no_getsockopt);
2128
2129int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
2130		    size_t len)
2131{
2132	return -EOPNOTSUPP;
2133}
2134EXPORT_SYMBOL(sock_no_sendmsg);
2135
2136int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
2137		    size_t len, int flags)
2138{
2139	return -EOPNOTSUPP;
2140}
2141EXPORT_SYMBOL(sock_no_recvmsg);
2142
2143int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2144{
2145	/* Mirror missing mmap method error code */
2146	return -ENODEV;
2147}
2148EXPORT_SYMBOL(sock_no_mmap);
2149
2150ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2151{
2152	ssize_t res;
2153	struct msghdr msg = {.msg_flags = flags};
2154	struct kvec iov;
2155	char *kaddr = kmap(page);
2156	iov.iov_base = kaddr + offset;
2157	iov.iov_len = size;
2158	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2159	kunmap(page);
2160	return res;
2161}
2162EXPORT_SYMBOL(sock_no_sendpage);
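
/*
 * Illustration only: how a datagram-style protocol might wire the stubs
 * above into its proto_ops table (the slots left as comments would be the
 * protocol's real handlers; my_proto_ops is a hypothetical name).
 */
static const struct proto_ops my_proto_ops = {
	.family		= PF_INET,
	.owner		= THIS_MODULE,
	/* .release, .bind, .getname, .sendmsg: protocol-specific handlers */
	.connect	= sock_no_connect,
	.socketpair	= sock_no_socketpair,
	.accept		= sock_no_accept,
	.poll		= datagram_poll,
	.ioctl		= sock_no_ioctl,
	.listen		= sock_no_listen,
	.shutdown	= sock_no_shutdown,
	.setsockopt	= sock_common_setsockopt,
	.getsockopt	= sock_common_getsockopt,
	.recvmsg	= sock_common_recvmsg,
	.mmap		= sock_no_mmap,
	.sendpage	= sock_no_sendpage,
};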
2163
2164/*
2165 *	Default Socket Callbacks
2166 */
2167
2168static void sock_def_wakeup(struct sock *sk)
2169{
2170	struct socket_wq *wq;
2171
2172	rcu_read_lock();
2173	wq = rcu_dereference(sk->sk_wq);
2174	if (wq_has_sleeper(wq))
2175		wake_up_interruptible_all(&wq->wait);
2176	rcu_read_unlock();
2177}
2178
2179static void sock_def_error_report(struct sock *sk)
2180{
2181	struct socket_wq *wq;
2182
2183	rcu_read_lock();
2184	wq = rcu_dereference(sk->sk_wq);
2185	if (wq_has_sleeper(wq))
2186		wake_up_interruptible_poll(&wq->wait, POLLERR);
2187	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2188	rcu_read_unlock();
2189}
2190
2191static void sock_def_readable(struct sock *sk, int len)
2192{
2193	struct socket_wq *wq;
2194
2195	rcu_read_lock();
2196	wq = rcu_dereference(sk->sk_wq);
2197	if (wq_has_sleeper(wq))
2198		wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
2199						POLLRDNORM | POLLRDBAND);
2200	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2201	rcu_read_unlock();
2202}
2203
2204static void sock_def_write_space(struct sock *sk)
2205{
2206	struct socket_wq *wq;
2207
2208	rcu_read_lock();
2209
2210	/* Do not wake up a writer until he can make "significant"
2211	 * progress.  --DaveM
2212	 */
2213	if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2214		wq = rcu_dereference(sk->sk_wq);
2215		if (wq_has_sleeper(wq))
2216			wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
2217						POLLWRNORM | POLLWRBAND);
2218
2219		/* Should agree with poll, otherwise some programs break */
2220		if (sock_writeable(sk))
2221			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2222	}
2223
2224	rcu_read_unlock();
2225}
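
/*
 * Worked example of the heuristic above: with sk_sndbuf = 65536, the
 * condition (sk_wmem_alloc << 1) <= sk_sndbuf only becomes true once
 * sk_wmem_alloc has dropped to 32768 or below, i.e. the writer is woken
 * after at least half of the send buffer has drained.
 */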
2226
2227static void sock_def_destruct(struct sock *sk)
2228{
2229	kfree(sk->sk_protinfo);
2230}
2231
2232void sk_send_sigurg(struct sock *sk)
2233{
2234	if (sk->sk_socket && sk->sk_socket->file)
2235		if (send_sigurg(&sk->sk_socket->file->f_owner))
2236			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2237}
2238EXPORT_SYMBOL(sk_send_sigurg);
2239
2240void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2241		    unsigned long expires)
2242{
2243	if (!mod_timer(timer, expires))
2244		sock_hold(sk);
2245}
2246EXPORT_SYMBOL(sk_reset_timer);
2247
2248void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2249{
2250	if (del_timer(timer))
2251		__sock_put(sk);
2252}
2253EXPORT_SYMBOL(sk_stop_timer);
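
/*
 * Hypothetical pairing for the two helpers above: sk_reset_timer() takes a
 * reference on the sock unless the timer was already pending, and the
 * handler drops it when it finishes.  Assumes the protocol did
 * setup_timer(&sk->sk_timer, my_proto_timer, (unsigned long)sk) at
 * creation time.
 */
static void my_proto_timer(unsigned long data)
{
	struct sock *sk = (struct sock *)data;

	bh_lock_sock(sk);
	/* ... timer work ... */
	bh_unlock_sock(sk);
	sock_put(sk);		/* release the reference sk_reset_timer() took */
}

static void my_proto_arm_timer(struct sock *sk)
{
	sk_reset_timer(sk, &sk->sk_timer, jiffies + HZ);
}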
2254
2255void sock_init_data(struct socket *sock, struct sock *sk)
2256{
2257	skb_queue_head_init(&sk->sk_receive_queue);
2258	skb_queue_head_init(&sk->sk_write_queue);
2259	skb_queue_head_init(&sk->sk_error_queue);
2260#ifdef CONFIG_NET_DMA
2261	skb_queue_head_init(&sk->sk_async_wait_queue);
2262#endif
2263
2264	sk->sk_send_head	=	NULL;
2265
2266	init_timer(&sk->sk_timer);
2267
2268	sk->sk_allocation	=	GFP_KERNEL;
2269	sk->sk_rcvbuf		=	sysctl_rmem_default;
2270	sk->sk_sndbuf		=	sysctl_wmem_default;
2271	sk->sk_state		=	TCP_CLOSE;
2272	sk_set_socket(sk, sock);
2273
2274	sock_set_flag(sk, SOCK_ZAPPED);
2275
2276	if (sock) {
2277		sk->sk_type	=	sock->type;
2278		sk->sk_wq	=	sock->wq;
2279		sock->sk	=	sk;
2280	} else
2281		sk->sk_wq	=	NULL;
2282
2283	spin_lock_init(&sk->sk_dst_lock);
2284	rwlock_init(&sk->sk_callback_lock);
2285	lockdep_set_class_and_name(&sk->sk_callback_lock,
2286			af_callback_keys + sk->sk_family,
2287			af_family_clock_key_strings[sk->sk_family]);
2288
2289	sk->sk_state_change	=	sock_def_wakeup;
2290	sk->sk_data_ready	=	sock_def_readable;
2291	sk->sk_write_space	=	sock_def_write_space;
2292	sk->sk_error_report	=	sock_def_error_report;
2293	sk->sk_destruct		=	sock_def_destruct;
2294
2295	sk->sk_frag.page	=	NULL;
2296	sk->sk_frag.offset	=	0;
2297	sk->sk_peek_off		=	-1;
2298
2299	sk->sk_peer_pid 	=	NULL;
2300	sk->sk_peer_cred	=	NULL;
2301	sk->sk_write_pending	=	0;
2302	sk->sk_rcvlowat		=	1;
2303	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
2304	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
2305
2306	sk->sk_stamp = ktime_set(-1L, 0);
2307
2308#ifdef CONFIG_NET_LL_RX_POLL
2309	sk->sk_napi_id		=	0;
2310	sk->sk_ll_usec		=	sysctl_net_ll_read;
2311#endif
2312
2313	/*
2314	 * Before updating sk_refcnt, we must commit prior changes to memory
2315	 * (see Documentation/RCU/rculist_nulls.txt for details)
2316	 */
2317	smp_wmb();
2318	atomic_set(&sk->sk_refcnt, 1);
2319	atomic_set(&sk->sk_drops, 0);
2320}
2321EXPORT_SYMBOL(sock_init_data);
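
/*
 * Sketch of a hypothetical address-family create() path that ends up here:
 * allocate the sock, attach it to the socket via sock_init_data(), then
 * override whichever default callbacks the protocol needs
 * (my_proto_create_sock and my_proto_data_ready are illustrative names).
 */
static void my_proto_data_ready(struct sock *sk, int len)
{
	/* protocol-specific wakeup would go here */
}

static int my_proto_create_sock(struct net *net, struct socket *sock,
				struct proto *prot)
{
	struct sock *sk = sk_alloc(net, PF_INET, GFP_KERNEL, prot);

	if (!sk)
		return -ENOBUFS;
	sock_init_data(sock, sk);
	sk->sk_data_ready = my_proto_data_ready;
	return 0;
}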
2322
2323void lock_sock_nested(struct sock *sk, int subclass)
2324{
2325	might_sleep();
2326	spin_lock_bh(&sk->sk_lock.slock);
2327	if (sk->sk_lock.owned)
2328		__lock_sock(sk);
2329	sk->sk_lock.owned = 1;
2330	spin_unlock(&sk->sk_lock.slock);
2331	/*
2332	 * The sk_lock has mutex_lock() semantics here:
2333	 */
2334	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2335	local_bh_enable();
2336}
2337EXPORT_SYMBOL(lock_sock_nested);
2338
2339void release_sock(struct sock *sk)
2340{
2341	/*
2342	 * The sk_lock has mutex_unlock() semantics:
2343	 */
2344	mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
2345
2346	spin_lock_bh(&sk->sk_lock.slock);
2347	if (sk->sk_backlog.tail)
2348		__release_sock(sk);
2349
2350	if (sk->sk_prot->release_cb)
2351		sk->sk_prot->release_cb(sk);
2352
2353	sk->sk_lock.owned = 0;
2354	if (waitqueue_active(&sk->sk_lock.wq))
2355		wake_up(&sk->sk_lock.wq);
2356	spin_unlock_bh(&sk->sk_lock.slock);
2357}
2358EXPORT_SYMBOL(release_sock);
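
/*
 * Minimal sketch of the usual process-context pattern built on
 * lock_sock()/release_sock() (my_proto_option_op is hypothetical):
 */
static void my_proto_option_op(struct sock *sk)
{
	lock_sock(sk);		/* may sleep; packets now queue on the backlog */
	/* ... update socket state ... */
	release_sock(sk);	/* replays the backlog before dropping ownership */
}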
2359
2360/**
2361 * lock_sock_fast - fast version of lock_sock
2362 * @sk: socket
2363 *
2364 * This version should be used for very small sections where the process
2365 * won't block.  Returns false if the fast path is taken:
2366 *   sk_lock.slock locked, owned = 0, BH disabled
2367 * Returns true if the slow path is taken:
2368 *   sk_lock.slock unlocked, owned = 1, BH enabled
2369 */
2370bool lock_sock_fast(struct sock *sk)
2371{
2372	might_sleep();
2373	spin_lock_bh(&sk->sk_lock.slock);
2374
2375	if (!sk->sk_lock.owned)
2376		/*
2377		 * Note: fast path -- we return with BH disabled and slock held
2378		 */
2379		return false;
2380
2381	__lock_sock(sk);
2382	sk->sk_lock.owned = 1;
2383	spin_unlock(&sk->sk_lock.slock);
2384	/*
2385	 * The sk_lock has mutex_lock() semantics here:
2386	 */
2387	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2388	local_bh_enable();
2389	return true;
2390}
2391EXPORT_SYMBOL(lock_sock_fast);
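
/*
 * Typical usage (hypothetical caller): the returned bool tells
 * unlock_sock_fast() whether the slow path was taken, so the two calls
 * must always be paired.
 */
static void my_proto_quick_op(struct sock *sk)
{
	bool slow = lock_sock_fast(sk);

	/* short, non-sleeping critical section */

	unlock_sock_fast(sk, slow);
}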
2392
2393int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2394{
2395	struct timeval tv;
2396	if (!sock_flag(sk, SOCK_TIMESTAMP))
2397		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2398	tv = ktime_to_timeval(sk->sk_stamp);
2399	if (tv.tv_sec == -1)
2400		return -ENOENT;
2401	if (tv.tv_sec == 0) {
2402		sk->sk_stamp = ktime_get_real();
2403		tv = ktime_to_timeval(sk->sk_stamp);
2404	}
2405	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2406}
2407EXPORT_SYMBOL(sock_get_timestamp);
2408
2409int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2410{
2411	struct timespec ts;
2412	if (!sock_flag(sk, SOCK_TIMESTAMP))
2413		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2414	ts = ktime_to_timespec(sk->sk_stamp);
2415	if (ts.tv_sec == -1)
2416		return -ENOENT;
2417	if (ts.tv_sec == 0) {
2418		sk->sk_stamp = ktime_get_real();
2419		ts = ktime_to_timespec(sk->sk_stamp);
2420	}
2421	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2422}
2423EXPORT_SYMBOL(sock_get_timestampns);
2424
2425void sock_enable_timestamp(struct sock *sk, int flag)
2426{
2427	if (!sock_flag(sk, flag)) {
2428		unsigned long previous_flags = sk->sk_flags;
2429
2430		sock_set_flag(sk, flag);
2431		/*
2432		 * We just set one of the two flags that require net
2433		 * time stamping, but time stamping might already have
2434		 * been enabled because of the other one.
2435		 */
2436		if (!(previous_flags & SK_FLAGS_TIMESTAMP))
2437			net_enable_timestamp();
2438	}
2439}
2440
2441/*
2442 *	Get a socket option on a socket.
2443 *
2444 *	FIX: POSIX 1003.1g is very ambiguous here. It states that
2445 *	asynchronous errors should be reported by getsockopt. We assume
2446 *	this means when you specify SO_ERROR (otherwise what's the point of it?).
2447 */
2448int sock_common_getsockopt(struct socket *sock, int level, int optname,
2449			   char __user *optval, int __user *optlen)
2450{
2451	struct sock *sk = sock->sk;
2452
2453	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2454}
2455EXPORT_SYMBOL(sock_common_getsockopt);
2456
2457#ifdef CONFIG_COMPAT
2458int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2459				  char __user *optval, int __user *optlen)
2460{
2461	struct sock *sk = sock->sk;
2462
2463	if (sk->sk_prot->compat_getsockopt != NULL)
2464		return sk->sk_prot->compat_getsockopt(sk, level, optname,
2465						      optval, optlen);
2466	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2467}
2468EXPORT_SYMBOL(compat_sock_common_getsockopt);
2469#endif
2470
2471int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
2472			struct msghdr *msg, size_t size, int flags)
2473{
2474	struct sock *sk = sock->sk;
2475	int addr_len = 0;
2476	int err;
2477
2478	err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
2479				   flags & ~MSG_DONTWAIT, &addr_len);
2480	if (err >= 0)
2481		msg->msg_namelen = addr_len;
2482	return err;
2483}
2484EXPORT_SYMBOL(sock_common_recvmsg);
2485
2486/*
2487 *	Set socket options on an inet socket.
2488 */
2489int sock_common_setsockopt(struct socket *sock, int level, int optname,
2490			   char __user *optval, unsigned int optlen)
2491{
2492	struct sock *sk = sock->sk;
2493
2494	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2495}
2496EXPORT_SYMBOL(sock_common_setsockopt);
2497
2498#ifdef CONFIG_COMPAT
2499int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2500				  char __user *optval, unsigned int optlen)
2501{
2502	struct sock *sk = sock->sk;
2503
2504	if (sk->sk_prot->compat_setsockopt != NULL)
2505		return sk->sk_prot->compat_setsockopt(sk, level, optname,
2506						      optval, optlen);
2507	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2508}
2509EXPORT_SYMBOL(compat_sock_common_setsockopt);
2510#endif
2511
2512void sk_common_release(struct sock *sk)
2513{
2514	if (sk->sk_prot->destroy)
2515		sk->sk_prot->destroy(sk);
2516
2517	/*
2518	 * Observation: when sk_common_release() is called, user processes no
2519	 * longer have access to the socket, but the network stack still does.
2520	 * Step one, detach it from networking:
2521	 *
2522	 * A. Remove from hash tables.
2523	 */
2524
2525	sk->sk_prot->unhash(sk);
2526
2527	/*
2528	 * At this point the socket cannot receive new packets, but some packets
2529	 * may still be in flight because another CPU running the receiver did
2530	 * its hash table lookup before we unhashed the socket. Those packets
2531	 * will reach the receive queue and be purged by the socket destructor.
2532	 *
2533	 * We also still have packets pending on the receive queue and, probably,
2534	 * our own packets waiting in device queues. sock_destroy will drain the
2535	 * receive queue, but transmitted packets will delay socket destruction
2536	 * until the last reference is released.
2537	 */
2538
2539	sock_orphan(sk);
2540
2541	xfrm_sk_free_policy(sk);
2542
2543	sk_refcnt_debug_release(sk);
2544
2545	if (sk->sk_frag.page) {
2546		put_page(sk->sk_frag.page);
2547		sk->sk_frag.page = NULL;
2548	}
2549
2550	sock_put(sk);
2551}
2552EXPORT_SYMBOL(sk_common_release);
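
/*
 * Hypothetical ->close() handler: protocols that keep no extra state of
 * their own can simply forward to sk_common_release(), relying on the
 * teardown sequence described above.
 */
static void my_proto_close(struct sock *sk, long timeout)
{
	sk_common_release(sk);
}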
2553
2554#ifdef CONFIG_PROC_FS
2555#define PROTO_INUSE_NR	64	/* should be enough for now */
2556struct prot_inuse {
2557	int val[PROTO_INUSE_NR];
2558};
2559
2560static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2561
2562#ifdef CONFIG_NET_NS
2563void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2564{
2565	__this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
2566}
2567EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2568
2569int sock_prot_inuse_get(struct net *net, struct proto *prot)
2570{
2571	int cpu, idx = prot->inuse_idx;
2572	int res = 0;
2573
2574	for_each_possible_cpu(cpu)
2575		res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2576
2577	return res >= 0 ? res : 0;
2578}
2579EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2580
2581static int __net_init sock_inuse_init_net(struct net *net)
2582{
2583	net->core.inuse = alloc_percpu(struct prot_inuse);
2584	return net->core.inuse ? 0 : -ENOMEM;
2585}
2586
2587static void __net_exit sock_inuse_exit_net(struct net *net)
2588{
2589	free_percpu(net->core.inuse);
2590}
2591
2592static struct pernet_operations net_inuse_ops = {
2593	.init = sock_inuse_init_net,
2594	.exit = sock_inuse_exit_net,
2595};
2596
2597static __init int net_inuse_init(void)
2598{
2599	if (register_pernet_subsys(&net_inuse_ops))
2600		panic("Cannot initialize net inuse counters");
2601
2602	return 0;
2603}
2604
2605core_initcall(net_inuse_init);
2606#else
2607static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2608
2609void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2610{
2611	__this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
2612}
2613EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2614
2615int sock_prot_inuse_get(struct net *net, struct proto *prot)
2616{
2617	int cpu, idx = prot->inuse_idx;
2618	int res = 0;
2619
2620	for_each_possible_cpu(cpu)
2621		res += per_cpu(prot_inuse, cpu).val[idx];
2622
2623	return res >= 0 ? res : 0;
2624}
2625EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2626#endif
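
/*
 * Hypothetical ->hash()/->unhash() pair showing how the per-cpu counters
 * above are maintained by protocols.
 */
static void my_proto_hash(struct sock *sk)
{
	/* ... insert sk into the protocol's lookup structures ... */
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
}

static void my_proto_unhash(struct sock *sk)
{
	/* ... remove sk from the lookup structures ... */
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
}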
2627
2628static void assign_proto_idx(struct proto *prot)
2629{
2630	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2631
2632	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2633		pr_err("PROTO_INUSE_NR exhausted\n");
2634		return;
2635	}
2636
2637	set_bit(prot->inuse_idx, proto_inuse_idx);
2638}
2639
2640static void release_proto_idx(struct proto *prot)
2641{
2642	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2643		clear_bit(prot->inuse_idx, proto_inuse_idx);
2644}
2645#else
2646static inline void assign_proto_idx(struct proto *prot)
2647{
2648}
2649
2650static inline void release_proto_idx(struct proto *prot)
2651{
2652}
2653#endif
2654
2655int proto_register(struct proto *prot, int alloc_slab)
2656{
2657	if (alloc_slab) {
2658		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2659					SLAB_HWCACHE_ALIGN | prot->slab_flags,
2660					NULL);
2661
2662		if (prot->slab == NULL) {
2663			pr_crit("%s: Can't create sock SLAB cache!\n",
2664				prot->name);
2665			goto out;
2666		}
2667
2668		if (prot->rsk_prot != NULL) {
2669			prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name);
2670			if (prot->rsk_prot->slab_name == NULL)
2671				goto out_free_sock_slab;
2672
2673			prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
2674								 prot->rsk_prot->obj_size, 0,
2675								 SLAB_HWCACHE_ALIGN, NULL);
2676
2677			if (prot->rsk_prot->slab == NULL) {
2678				pr_crit("%s: Can't create request sock SLAB cache!\n",
2679					prot->name);
2680				goto out_free_request_sock_slab_name;
2681			}
2682		}
2683
2684		if (prot->twsk_prot != NULL) {
2685			prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
2686
2687			if (prot->twsk_prot->twsk_slab_name == NULL)
2688				goto out_free_request_sock_slab;
2689
2690			prot->twsk_prot->twsk_slab =
2691				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2692						  prot->twsk_prot->twsk_obj_size,
2693						  0,
2694						  SLAB_HWCACHE_ALIGN |
2695							prot->slab_flags,
2696						  NULL);
2697			if (prot->twsk_prot->twsk_slab == NULL)
2698				goto out_free_timewait_sock_slab_name;
2699		}
2700	}
2701
2702	mutex_lock(&proto_list_mutex);
2703	list_add(&prot->node, &proto_list);
2704	assign_proto_idx(prot);
2705	mutex_unlock(&proto_list_mutex);
2706	return 0;
2707
2708out_free_timewait_sock_slab_name:
2709	kfree(prot->twsk_prot->twsk_slab_name);
2710out_free_request_sock_slab:
2711	if (prot->rsk_prot && prot->rsk_prot->slab) {
2712		kmem_cache_destroy(prot->rsk_prot->slab);
2713		prot->rsk_prot->slab = NULL;
2714	}
2715out_free_request_sock_slab_name:
2716	if (prot->rsk_prot)
2717		kfree(prot->rsk_prot->slab_name);
2718out_free_sock_slab:
2719	kmem_cache_destroy(prot->slab);
2720	prot->slab = NULL;
2721out:
2722	return -ENOBUFS;
2723}
2724EXPORT_SYMBOL(proto_register);
2725
2726void proto_unregister(struct proto *prot)
2727{
2728	mutex_lock(&proto_list_mutex);
2729	release_proto_idx(prot);
2730	list_del(&prot->node);
2731	mutex_unlock(&proto_list_mutex);
2732
2733	if (prot->slab != NULL) {
2734		kmem_cache_destroy(prot->slab);
2735		prot->slab = NULL;
2736	}
2737
2738	if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
2739		kmem_cache_destroy(prot->rsk_prot->slab);
2740		kfree(prot->rsk_prot->slab_name);
2741		prot->rsk_prot->slab = NULL;
2742	}
2743
2744	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2745		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2746		kfree(prot->twsk_prot->twsk_slab_name);
2747		prot->twsk_prot->twsk_slab = NULL;
2748	}
2749}
2750EXPORT_SYMBOL(proto_unregister);
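
/*
 * Registration sketch (hypothetical protocol and module): the second
 * argument to proto_register() asks for a dedicated slab cache of
 * obj_size bytes per socket.
 */
static struct proto my_proto = {
	.name	  = "MYPROTO",
	.owner	  = THIS_MODULE,
	.obj_size = sizeof(struct sock),
};

static int __init my_proto_module_init(void)
{
	return proto_register(&my_proto, 1);
}

static void __exit my_proto_module_exit(void)
{
	proto_unregister(&my_proto);
}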
2751
2752#ifdef CONFIG_PROC_FS
2753static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2754	__acquires(proto_list_mutex)
2755{
2756	mutex_lock(&proto_list_mutex);
2757	return seq_list_start_head(&proto_list, *pos);
2758}
2759
2760static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2761{
2762	return seq_list_next(v, &proto_list, pos);
2763}
2764
2765static void proto_seq_stop(struct seq_file *seq, void *v)
2766	__releases(proto_list_mutex)
2767{
2768	mutex_unlock(&proto_list_mutex);
2769}
2770
2771static char proto_method_implemented(const void *method)
2772{
2773	return method == NULL ? 'n' : 'y';
2774}
2775static long sock_prot_memory_allocated(struct proto *proto)
2776{
2777	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
2778}
2779
2780static char *sock_prot_memory_pressure(struct proto *proto)
2781{
2782	return proto->memory_pressure != NULL ?
2783	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
2784}
2785
2786static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2787{
2788
2789	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
2790			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2791		   proto->name,
2792		   proto->obj_size,
2793		   sock_prot_inuse_get(seq_file_net(seq), proto),
2794		   sock_prot_memory_allocated(proto),
2795		   sock_prot_memory_pressure(proto),
2796		   proto->max_header,
2797		   proto->slab == NULL ? "no" : "yes",
2798		   module_name(proto->owner),
2799		   proto_method_implemented(proto->close),
2800		   proto_method_implemented(proto->connect),
2801		   proto_method_implemented(proto->disconnect),
2802		   proto_method_implemented(proto->accept),
2803		   proto_method_implemented(proto->ioctl),
2804		   proto_method_implemented(proto->init),
2805		   proto_method_implemented(proto->destroy),
2806		   proto_method_implemented(proto->shutdown),
2807		   proto_method_implemented(proto->setsockopt),
2808		   proto_method_implemented(proto->getsockopt),
2809		   proto_method_implemented(proto->sendmsg),
2810		   proto_method_implemented(proto->recvmsg),
2811		   proto_method_implemented(proto->sendpage),
2812		   proto_method_implemented(proto->bind),
2813		   proto_method_implemented(proto->backlog_rcv),
2814		   proto_method_implemented(proto->hash),
2815		   proto_method_implemented(proto->unhash),
2816		   proto_method_implemented(proto->get_port),
2817		   proto_method_implemented(proto->enter_memory_pressure));
2818}
2819
2820static int proto_seq_show(struct seq_file *seq, void *v)
2821{
2822	if (v == &proto_list)
2823		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2824			   "protocol",
2825			   "size",
2826			   "sockets",
2827			   "memory",
2828			   "press",
2829			   "maxhdr",
2830			   "slab",
2831			   "module",
2832			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2833	else
2834		proto_seq_printf(seq, list_entry(v, struct proto, node));
2835	return 0;
2836}
2837
2838static const struct seq_operations proto_seq_ops = {
2839	.start  = proto_seq_start,
2840	.next   = proto_seq_next,
2841	.stop   = proto_seq_stop,
2842	.show   = proto_seq_show,
2843};
2844
2845static int proto_seq_open(struct inode *inode, struct file *file)
2846{
2847	return seq_open_net(inode, file, &proto_seq_ops,
2848			    sizeof(struct seq_net_private));
2849}
2850
2851static const struct file_operations proto_seq_fops = {
2852	.owner		= THIS_MODULE,
2853	.open		= proto_seq_open,
2854	.read		= seq_read,
2855	.llseek		= seq_lseek,
2856	.release	= seq_release_net,
2857};
2858
2859static __net_init int proto_init_net(struct net *net)
2860{
2861	if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
2862		return -ENOMEM;
2863
2864	return 0;
2865}
2866
2867static __net_exit void proto_exit_net(struct net *net)
2868{
2869	remove_proc_entry("protocols", net->proc_net);
2870}
2871
2872
2873static __net_initdata struct pernet_operations proto_net_ops = {
2874	.init = proto_init_net,
2875	.exit = proto_exit_net,
2876};
2877
2878static int __init proto_init(void)
2879{
2880	return register_pernet_subsys(&proto_net_ops);
2881}
2882
2883subsys_initcall(proto_init);
2884
2885#endif /* PROC_FS */
2886