sock.c revision 7eec4174ff29cd42f2acfae8112f51c228545d40
1/*
2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
3 *		operating system.  INET is implemented using the  BSD Socket
4 *		interface as the means of communication with the user level.
5 *
6 *		Generic socket support routines. Memory allocators, socket lock/release
7 *		handlers for protocols to use, and the generic option handler.
8 *
9 *
10 * Authors:	Ross Biro
11 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 *		Florian La Roche, <flla@stud.uni-sb.de>
13 *		Alan Cox, <A.Cox@swansea.ac.uk>
14 *
15 * Fixes:
16 *		Alan Cox	: 	Numerous verify_area() problems
17 *		Alan Cox	:	Connecting on a connecting socket
18 *					now returns an error for tcp.
19 *		Alan Cox	:	sock->protocol is set correctly.
20 *					and is not sometimes left as 0.
21 *		Alan Cox	:	connect handles icmp errors on a
22 *					connect properly. Unfortunately there
23 *					is a restart syscall nasty there. I
24 *					can't match BSD without hacking the C
25 *					library. Ideas urgently sought!
26 *		Alan Cox	:	Disallow bind() to addresses that are
27 *					not ours - especially broadcast ones!!
28 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30 *					instead they leave that for the DESTROY timer.
31 *		Alan Cox	:	Clean up error flag in accept
32 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33 *					was buggy. Put a remove_sock() in the handler
34 *					for memory when we hit 0. Also altered the timer
35 *					code. The ACK stuff can wait and needs major
36 *					TCP layer surgery.
37 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38 *					and fixed timer/inet_bh race.
39 *		Alan Cox	:	Added zapped flag for TCP
40 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47 *	Pauline Middelink	:	identd support
48 *		Alan Cox	:	Fixed connect() taking signals I think.
49 *		Alan Cox	:	SO_LINGER supported
50 *		Alan Cox	:	Error reporting fixes
51 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52 *		Alan Cox	:	inet sockets don't set sk->type!
53 *		Alan Cox	:	Split socket option code
54 *		Alan Cox	:	Callbacks
55 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56 *		Alex		:	Removed restriction on inet fioctl
57 *		Alan Cox	:	Splitting INET from NET core
58 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60 *		Alan Cox	:	Split IP from generic code
61 *		Alan Cox	:	New kfree_skbmem()
62 *		Alan Cox	:	Make SO_DEBUG superuser only.
63 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64 *					(compatibility fix)
65 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66 *		Alan Cox	:	Allocator for a socket is settable.
67 *		Alan Cox	:	SO_ERROR includes soft errors.
68 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69 *		Alan Cox	: 	Generic socket allocation to make hooks
70 *					easier (suggested by Craig Metz).
71 *		Michael Pall	:	SO_ERROR returns positive errno again
72 *              Steve Whitehouse:       Added default destructor to free
73 *                                      protocol private data.
74 *              Steve Whitehouse:       Added various other default routines
75 *                                      common to several socket families.
76 *              Chris Evans     :       Call suser() check last on F_SETOWN
77 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79 *		Andi Kleen	:	Fix write_space callback
80 *		Chris Evans	:	Security fixes - signedness again
81 *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82 *
83 * To Fix:
84 *
85 *
86 *		This program is free software; you can redistribute it and/or
87 *		modify it under the terms of the GNU General Public License
88 *		as published by the Free Software Foundation; either version
89 *		2 of the License, or (at your option) any later version.
90 */
91
92#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
93
94#include <linux/capability.h>
95#include <linux/errno.h>
96#include <linux/errqueue.h>
97#include <linux/types.h>
98#include <linux/socket.h>
99#include <linux/in.h>
100#include <linux/kernel.h>
101#include <linux/module.h>
102#include <linux/proc_fs.h>
103#include <linux/seq_file.h>
104#include <linux/sched.h>
105#include <linux/timer.h>
106#include <linux/string.h>
107#include <linux/sockios.h>
108#include <linux/net.h>
109#include <linux/mm.h>
110#include <linux/slab.h>
111#include <linux/interrupt.h>
112#include <linux/poll.h>
113#include <linux/tcp.h>
114#include <linux/init.h>
115#include <linux/highmem.h>
116#include <linux/user_namespace.h>
117#include <linux/static_key.h>
118#include <linux/memcontrol.h>
119#include <linux/prefetch.h>
120
121#include <asm/uaccess.h>
122
123#include <linux/netdevice.h>
124#include <net/protocol.h>
125#include <linux/skbuff.h>
126#include <net/net_namespace.h>
127#include <net/request_sock.h>
128#include <net/sock.h>
129#include <linux/net_tstamp.h>
130#include <net/xfrm.h>
131#include <linux/ipsec.h>
132#include <net/cls_cgroup.h>
133#include <net/netprio_cgroup.h>
134
135#include <linux/filter.h>
136
137#include <trace/events/sock.h>
138
139#ifdef CONFIG_INET
140#include <net/tcp.h>
141#endif
142
143#include <net/busy_poll.h>
144
145static DEFINE_MUTEX(proto_list_mutex);
146static LIST_HEAD(proto_list);
147
148#ifdef CONFIG_MEMCG_KMEM
149int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
150{
151	struct proto *proto;
152	int ret = 0;
153
154	mutex_lock(&proto_list_mutex);
155	list_for_each_entry(proto, &proto_list, node) {
156		if (proto->init_cgroup) {
157			ret = proto->init_cgroup(memcg, ss);
158			if (ret)
159				goto out;
160		}
161	}
162
163	mutex_unlock(&proto_list_mutex);
164	return ret;
165out:
166	list_for_each_entry_continue_reverse(proto, &proto_list, node)
167		if (proto->destroy_cgroup)
168			proto->destroy_cgroup(memcg);
169	mutex_unlock(&proto_list_mutex);
170	return ret;
171}
172
173void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg)
174{
175	struct proto *proto;
176
177	mutex_lock(&proto_list_mutex);
178	list_for_each_entry_reverse(proto, &proto_list, node)
179		if (proto->destroy_cgroup)
180			proto->destroy_cgroup(memcg);
181	mutex_unlock(&proto_list_mutex);
182}
183#endif
184
185/*
186 * Each address family might have different locking rules, so we have
187 * one slock key per address family:
188 */
189static struct lock_class_key af_family_keys[AF_MAX];
190static struct lock_class_key af_family_slock_keys[AF_MAX];
191
192#if defined(CONFIG_MEMCG_KMEM)
193struct static_key memcg_socket_limit_enabled;
194EXPORT_SYMBOL(memcg_socket_limit_enabled);
195#endif
196
197/*
198 * Make lock validator output more readable. (We pre-construct these
199 * strings at build time, so that runtime initialization of socket
200 * locks is fast):
201 */
202static const char *const af_family_key_strings[AF_MAX+1] = {
203  "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
204  "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
205  "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
206  "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
207  "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
208  "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
209  "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
210  "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
211  "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
212  "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
213  "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-AF_IUCV"     ,
214  "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
215  "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
216  "sk_lock-AF_NFC"   , "sk_lock-AF_VSOCK"    , "sk_lock-AF_MAX"
217};
218static const char *const af_family_slock_key_strings[AF_MAX+1] = {
219  "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
220  "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
221  "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
222  "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
223  "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
224  "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
225  "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
226  "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
227  "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
228  "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
229  "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
230  "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
231  "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
232  "slock-AF_NFC"   , "slock-AF_VSOCK"    , "slock-AF_MAX"
233};
234static const char *const af_family_clock_key_strings[AF_MAX+1] = {
235  "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
236  "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
237  "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
238  "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
239  "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
240  "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
241  "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
242  "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
243  "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
244  "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
245  "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
246  "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
247  "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
248  "clock-AF_NFC"   , "clock-AF_VSOCK"    , "clock-AF_MAX"
249};
250
251/*
252 * sk_callback_lock locking rules are per-address-family,
253 * so split the lock classes by using a per-AF key:
254 */
255static struct lock_class_key af_callback_keys[AF_MAX];
256
257/* Take into consideration the size of the struct sk_buff overhead in the
258 * determination of these values, since that is non-constant across
259 * platforms.  This makes socket queueing behavior and performance
260 * not depend upon such differences.
261 */
262#define _SK_MEM_PACKETS		256
263#define _SK_MEM_OVERHEAD	SKB_TRUESIZE(256)
264#define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
265#define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
266
267/* Run time adjustable parameters. */
268__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
269EXPORT_SYMBOL(sysctl_wmem_max);
270__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
271EXPORT_SYMBOL(sysctl_rmem_max);
272__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
273__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
274
275/* Maximal space eaten by iovec or ancillary data plus some space */
276int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
277EXPORT_SYMBOL(sysctl_optmem_max);
278
279struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
280EXPORT_SYMBOL_GPL(memalloc_socks);
281
282/**
283 * sk_set_memalloc - sets %SOCK_MEMALLOC
284 * @sk: socket to set it on
285 *
286 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
287 * It's the responsibility of the admin to adjust min_free_kbytes
288 * to meet the requirements.
289 */
290void sk_set_memalloc(struct sock *sk)
291{
292	sock_set_flag(sk, SOCK_MEMALLOC);
293	sk->sk_allocation |= __GFP_MEMALLOC;
294	static_key_slow_inc(&memalloc_socks);
295}
296EXPORT_SYMBOL_GPL(sk_set_memalloc);
297
298void sk_clear_memalloc(struct sock *sk)
299{
300	sock_reset_flag(sk, SOCK_MEMALLOC);
301	sk->sk_allocation &= ~__GFP_MEMALLOC;
302	static_key_slow_dec(&memalloc_socks);
303
304	/*
305	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
306	 * progress of swapping. However, if SOCK_MEMALLOC is cleared while
307	 * the socket still has rmem allocations, there is a risk that its
308	 * user cannot make forward progress due to exceeding the rmem
309	 * limits. By rights, sk_clear_memalloc() should only be called
310	 * on sockets being torn down, but warn and reset the accounting if
311	 * that assumption breaks.
312	 */
313	if (WARN_ON(sk->sk_forward_alloc))
314		sk_mem_reclaim(sk);
315}
316EXPORT_SYMBOL_GPL(sk_clear_memalloc);
317
318int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
319{
320	int ret;
321	unsigned long pflags = current->flags;
322
323	/* these should have been dropped before queueing */
324	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
325
326	current->flags |= PF_MEMALLOC;
327	ret = sk->sk_backlog_rcv(sk, skb);
328	tsk_restore_flags(current, pflags, PF_MEMALLOC);
329
330	return ret;
331}
332EXPORT_SYMBOL(__sk_backlog_rcv);
333
334static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
335{
336	struct timeval tv;
337
338	if (optlen < sizeof(tv))
339		return -EINVAL;
340	if (copy_from_user(&tv, optval, sizeof(tv)))
341		return -EFAULT;
342	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
343		return -EDOM;
344
345	if (tv.tv_sec < 0) {
346		static int warned __read_mostly;
347
348		*timeo_p = 0;
349		if (warned < 10 && net_ratelimit()) {
350			warned++;
351			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
352				__func__, current->comm, task_pid_nr(current));
353		}
354		return 0;
355	}
356	*timeo_p = MAX_SCHEDULE_TIMEOUT;
357	if (tv.tv_sec == 0 && tv.tv_usec == 0)
358		return 0;
359	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
360		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
361	return 0;
362}
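
/*
 * Illustrative userspace view of the timeout parsing above (example only,
 * not part of this file's build): SO_RCVTIMEO/SO_SNDTIMEO take a struct
 * timeval, {0, 0} means "block forever", a tv_usec outside [0, 1000000)
 * is rejected with EDOM, and a negative tv_sec is warned about and treated
 * as a zero (non-blocking) timeout.
 *
 *	struct timeval tv = { .tv_sec = 5, .tv_usec = 0 };
 *
 *	if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)) < 0)
 *		perror("SO_RCVTIMEO");
 *
 * After this, a blocking recv() on fd fails with EAGAIN/EWOULDBLOCK once
 * roughly five seconds pass without data.
 */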
363
364static void sock_warn_obsolete_bsdism(const char *name)
365{
366	static int warned;
367	static char warncomm[TASK_COMM_LEN];
368	if (strcmp(warncomm, current->comm) && warned < 5) {
369		strcpy(warncomm,  current->comm);
370		pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
371			warncomm, name);
372		warned++;
373	}
374}
375
376#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
377
378static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
379{
380	if (sk->sk_flags & flags) {
381		sk->sk_flags &= ~flags;
382		if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP))
383			net_disable_timestamp();
384	}
385}
386
387
388int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
389{
390	int err;
391	int skb_len;
392	unsigned long flags;
393	struct sk_buff_head *list = &sk->sk_receive_queue;
394
395	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
396		atomic_inc(&sk->sk_drops);
397		trace_sock_rcvqueue_full(sk, skb);
398		return -ENOMEM;
399	}
400
401	err = sk_filter(sk, skb);
402	if (err)
403		return err;
404
405	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
406		atomic_inc(&sk->sk_drops);
407		return -ENOBUFS;
408	}
409
410	skb->dev = NULL;
411	skb_set_owner_r(skb, sk);
412
413	/* Cache the SKB length before we tack it onto the receive
414	 * queue.  Once it is added it no longer belongs to us and
415	 * may be freed by other threads of control pulling packets
416	 * from the queue.
417	 */
418	skb_len = skb->len;
419
420	/* We escape from the RCU-protected region, so make sure we don't
421	 * leak a non-refcounted dst.
422	 */
423	skb_dst_force(skb);
424
425	spin_lock_irqsave(&list->lock, flags);
426	skb->dropcount = atomic_read(&sk->sk_drops);
427	__skb_queue_tail(list, skb);
428	spin_unlock_irqrestore(&list->lock, flags);
429
430	if (!sock_flag(sk, SOCK_DEAD))
431		sk->sk_data_ready(sk, skb_len);
432	return 0;
433}
434EXPORT_SYMBOL(sock_queue_rcv_skb);
435
436int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
437{
438	int rc = NET_RX_SUCCESS;
439
440	if (sk_filter(sk, skb))
441		goto discard_and_relse;
442
443	skb->dev = NULL;
444
445	if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) {
446		atomic_inc(&sk->sk_drops);
447		goto discard_and_relse;
448	}
449	if (nested)
450		bh_lock_sock_nested(sk);
451	else
452		bh_lock_sock(sk);
453	if (!sock_owned_by_user(sk)) {
454		/*
455		 * trylock + unlock semantics:
456		 */
457		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
458
459		rc = sk_backlog_rcv(sk, skb);
460
461		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
462	} else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
463		bh_unlock_sock(sk);
464		atomic_inc(&sk->sk_drops);
465		goto discard_and_relse;
466	}
467
468	bh_unlock_sock(sk);
469out:
470	sock_put(sk);
471	return rc;
472discard_and_relse:
473	kfree_skb(skb);
474	goto out;
475}
476EXPORT_SYMBOL(sk_receive_skb);
477
478void sk_reset_txq(struct sock *sk)
479{
480	sk_tx_queue_clear(sk);
481}
482EXPORT_SYMBOL(sk_reset_txq);
483
484struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
485{
486	struct dst_entry *dst = __sk_dst_get(sk);
487
488	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
489		sk_tx_queue_clear(sk);
490		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
491		dst_release(dst);
492		return NULL;
493	}
494
495	return dst;
496}
497EXPORT_SYMBOL(__sk_dst_check);
498
499struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
500{
501	struct dst_entry *dst = sk_dst_get(sk);
502
503	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
504		sk_dst_reset(sk);
505		dst_release(dst);
506		return NULL;
507	}
508
509	return dst;
510}
511EXPORT_SYMBOL(sk_dst_check);
512
513static int sock_setbindtodevice(struct sock *sk, char __user *optval,
514				int optlen)
515{
516	int ret = -ENOPROTOOPT;
517#ifdef CONFIG_NETDEVICES
518	struct net *net = sock_net(sk);
519	char devname[IFNAMSIZ];
520	int index;
521
522	/* Sorry... */
523	ret = -EPERM;
524	if (!ns_capable(net->user_ns, CAP_NET_RAW))
525		goto out;
526
527	ret = -EINVAL;
528	if (optlen < 0)
529		goto out;
530
531	/* Bind this socket to a particular device like "eth0",
532	 * as specified in the passed interface name. If the
533	 * name is "" or the option length is zero, the socket
534	 * is not bound (a userspace usage sketch follows this function).
535	 */
536	if (optlen > IFNAMSIZ - 1)
537		optlen = IFNAMSIZ - 1;
538	memset(devname, 0, sizeof(devname));
539
540	ret = -EFAULT;
541	if (copy_from_user(devname, optval, optlen))
542		goto out;
543
544	index = 0;
545	if (devname[0] != '\0') {
546		struct net_device *dev;
547
548		rcu_read_lock();
549		dev = dev_get_by_name_rcu(net, devname);
550		if (dev)
551			index = dev->ifindex;
552		rcu_read_unlock();
553		ret = -ENODEV;
554		if (!dev)
555			goto out;
556	}
557
558	lock_sock(sk);
559	sk->sk_bound_dev_if = index;
560	sk_dst_reset(sk);
561	release_sock(sk);
562
563	ret = 0;
564
565out:
566#endif
567
568	return ret;
569}
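
/*
 * Userspace usage sketch for SO_BINDTODEVICE (illustrative only): the
 * caller needs CAP_NET_RAW in the socket's network namespace, and passing
 * an empty name or a zero option length removes an existing binding.
 *
 *	const char ifname[] = "eth0";
 *
 *	if (setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE,
 *		       ifname, strlen(ifname)) < 0)
 *		perror("SO_BINDTODEVICE");
 *
 * The matching getter, sock_getbindtodevice() below, copies the bound
 * interface name back out via getsockopt(SO_BINDTODEVICE).
 */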
570
571static int sock_getbindtodevice(struct sock *sk, char __user *optval,
572				int __user *optlen, int len)
573{
574	int ret = -ENOPROTOOPT;
575#ifdef CONFIG_NETDEVICES
576	struct net *net = sock_net(sk);
577	char devname[IFNAMSIZ];
578
579	if (sk->sk_bound_dev_if == 0) {
580		len = 0;
581		goto zero;
582	}
583
584	ret = -EINVAL;
585	if (len < IFNAMSIZ)
586		goto out;
587
588	ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
589	if (ret)
590		goto out;
591
592	len = strlen(devname) + 1;
593
594	ret = -EFAULT;
595	if (copy_to_user(optval, devname, len))
596		goto out;
597
598zero:
599	ret = -EFAULT;
600	if (put_user(len, optlen))
601		goto out;
602
603	ret = 0;
604
605out:
606#endif
607
608	return ret;
609}
610
611static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
612{
613	if (valbool)
614		sock_set_flag(sk, bit);
615	else
616		sock_reset_flag(sk, bit);
617}
618
619/*
620 *	This is meant for all protocols to use and covers goings on
621 *	at the socket level. Everything here is generic.
622 */
623
624int sock_setsockopt(struct socket *sock, int level, int optname,
625		    char __user *optval, unsigned int optlen)
626{
627	struct sock *sk = sock->sk;
628	int val;
629	int valbool;
630	struct linger ling;
631	int ret = 0;
632
633	/*
634	 *	Options without arguments
635	 */
636
637	if (optname == SO_BINDTODEVICE)
638		return sock_setbindtodevice(sk, optval, optlen);
639
640	if (optlen < sizeof(int))
641		return -EINVAL;
642
643	if (get_user(val, (int __user *)optval))
644		return -EFAULT;
645
646	valbool = val ? 1 : 0;
647
648	lock_sock(sk);
649
650	switch (optname) {
651	case SO_DEBUG:
652		if (val && !capable(CAP_NET_ADMIN))
653			ret = -EACCES;
654		else
655			sock_valbool_flag(sk, SOCK_DBG, valbool);
656		break;
657	case SO_REUSEADDR:
658		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
659		break;
660	case SO_REUSEPORT:
661		sk->sk_reuseport = valbool;
662		break;
663	case SO_TYPE:
664	case SO_PROTOCOL:
665	case SO_DOMAIN:
666	case SO_ERROR:
667		ret = -ENOPROTOOPT;
668		break;
669	case SO_DONTROUTE:
670		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
671		break;
672	case SO_BROADCAST:
673		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
674		break;
675	case SO_SNDBUF:
676		/* Don't error on this; BSD doesn't, and if you think
677		 * about it, this is right. Otherwise apps have to
678		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
679		 * are treated in BSD as hints.
680		 */
681		val = min_t(u32, val, sysctl_wmem_max);
682set_sndbuf:
683		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
684		sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF);
685		/* Wake up sending tasks if we upped the value. */
686		sk->sk_write_space(sk);
687		break;
688
689	case SO_SNDBUFFORCE:
690		if (!capable(CAP_NET_ADMIN)) {
691			ret = -EPERM;
692			break;
693		}
694		goto set_sndbuf;
695
696	case SO_RCVBUF:
697		/* Don't error on this; BSD doesn't, and if you think
698		 * about it, this is right. Otherwise apps have to
699		 * play 'guess the biggest size' games. RCVBUF/SNDBUF are
700		 * treated in BSD as hints (usage sketch after this function).
701		 */
702		val = min_t(u32, val, sysctl_rmem_max);
703set_rcvbuf:
704		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
705		/*
706		 * We double it on the way in to account for
707		 * "struct sk_buff" etc. overhead.   Applications
708		 * assume that the SO_RCVBUF setting they make will
709		 * allow that much actual data to be received on that
710		 * socket.
711		 *
712		 * Applications are unaware that "struct sk_buff" and
713		 * other overheads allocate from the receive buffer
714		 * during socket buffer allocation.
715		 *
716		 * And after considering the possible alternatives,
717		 * returning the value we actually used in getsockopt
718		 * is the most desirable behavior.
719		 */
720		sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF);
721		break;
722
723	case SO_RCVBUFFORCE:
724		if (!capable(CAP_NET_ADMIN)) {
725			ret = -EPERM;
726			break;
727		}
728		goto set_rcvbuf;
729
730	case SO_KEEPALIVE:
731#ifdef CONFIG_INET
732		if (sk->sk_protocol == IPPROTO_TCP &&
733		    sk->sk_type == SOCK_STREAM)
734			tcp_set_keepalive(sk, valbool);
735#endif
736		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
737		break;
738
739	case SO_OOBINLINE:
740		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
741		break;
742
743	case SO_NO_CHECK:
744		sk->sk_no_check = valbool;
745		break;
746
747	case SO_PRIORITY:
748		if ((val >= 0 && val <= 6) ||
749		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
750			sk->sk_priority = val;
751		else
752			ret = -EPERM;
753		break;
754
755	case SO_LINGER:
756		if (optlen < sizeof(ling)) {
757			ret = -EINVAL;	/* 1003.1g */
758			break;
759		}
760		if (copy_from_user(&ling, optval, sizeof(ling))) {
761			ret = -EFAULT;
762			break;
763		}
764		if (!ling.l_onoff)
765			sock_reset_flag(sk, SOCK_LINGER);
766		else {
767#if (BITS_PER_LONG == 32)
768			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
769				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
770			else
771#endif
772				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
773			sock_set_flag(sk, SOCK_LINGER);
774		}
775		break;
776
777	case SO_BSDCOMPAT:
778		sock_warn_obsolete_bsdism("setsockopt");
779		break;
780
781	case SO_PASSCRED:
782		if (valbool)
783			set_bit(SOCK_PASSCRED, &sock->flags);
784		else
785			clear_bit(SOCK_PASSCRED, &sock->flags);
786		break;
787
788	case SO_TIMESTAMP:
789	case SO_TIMESTAMPNS:
790		if (valbool)  {
791			if (optname == SO_TIMESTAMP)
792				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
793			else
794				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
795			sock_set_flag(sk, SOCK_RCVTSTAMP);
796			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
797		} else {
798			sock_reset_flag(sk, SOCK_RCVTSTAMP);
799			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
800		}
801		break;
802
803	case SO_TIMESTAMPING:
804		if (val & ~SOF_TIMESTAMPING_MASK) {
805			ret = -EINVAL;
806			break;
807		}
808		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
809				  val & SOF_TIMESTAMPING_TX_HARDWARE);
810		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
811				  val & SOF_TIMESTAMPING_TX_SOFTWARE);
812		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
813				  val & SOF_TIMESTAMPING_RX_HARDWARE);
814		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
815			sock_enable_timestamp(sk,
816					      SOCK_TIMESTAMPING_RX_SOFTWARE);
817		else
818			sock_disable_timestamp(sk,
819					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
820		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
821				  val & SOF_TIMESTAMPING_SOFTWARE);
822		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
823				  val & SOF_TIMESTAMPING_SYS_HARDWARE);
824		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
825				  val & SOF_TIMESTAMPING_RAW_HARDWARE);
826		break;
827
828	case SO_RCVLOWAT:
829		if (val < 0)
830			val = INT_MAX;
831		sk->sk_rcvlowat = val ? : 1;
832		break;
833
834	case SO_RCVTIMEO:
835		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
836		break;
837
838	case SO_SNDTIMEO:
839		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
840		break;
841
842	case SO_ATTACH_FILTER:
843		ret = -EINVAL;
844		if (optlen == sizeof(struct sock_fprog)) {
845			struct sock_fprog fprog;
846
847			ret = -EFAULT;
848			if (copy_from_user(&fprog, optval, sizeof(fprog)))
849				break;
850
851			ret = sk_attach_filter(&fprog, sk);
852		}
853		break;
854
855	case SO_DETACH_FILTER:
856		ret = sk_detach_filter(sk);
857		break;
858
859	case SO_LOCK_FILTER:
860		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
861			ret = -EPERM;
862		else
863			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
864		break;
865
866	case SO_PASSSEC:
867		if (valbool)
868			set_bit(SOCK_PASSSEC, &sock->flags);
869		else
870			clear_bit(SOCK_PASSSEC, &sock->flags);
871		break;
872	case SO_MARK:
873		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
874			ret = -EPERM;
875		else
876			sk->sk_mark = val;
877		break;
878
879		/* We implement the SO_SNDLOWAT etc to
880		   not be settable (1003.1g 5.3) */
881	case SO_RXQ_OVFL:
882		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
883		break;
884
885	case SO_WIFI_STATUS:
886		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
887		break;
888
889	case SO_PEEK_OFF:
890		if (sock->ops->set_peek_off)
891			sock->ops->set_peek_off(sk, val);
892		else
893			ret = -EOPNOTSUPP;
894		break;
895
896	case SO_NOFCS:
897		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
898		break;
899
900	case SO_SELECT_ERR_QUEUE:
901		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
902		break;
903
904#ifdef CONFIG_NET_RX_BUSY_POLL
905	case SO_BUSY_POLL:
906		/* allow unprivileged users to decrease the value */
907		if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
908			ret = -EPERM;
909		else {
910			if (val < 0)
911				ret = -EINVAL;
912			else
913				sk->sk_ll_usec = val;
914		}
915		break;
916#endif
917	default:
918		ret = -ENOPROTOOPT;
919		break;
920	}
921	release_sock(sk);
922	return ret;
923}
924EXPORT_SYMBOL(sock_setsockopt);
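
/*
 * Illustrative userspace sketch of the SO_RCVBUF/SO_SNDBUF behaviour
 * handled above (example only): the requested value is clamped to
 * sysctl_rmem_max/sysctl_wmem_max and then doubled to cover struct
 * sk_buff overhead, and getsockopt() reports the doubled value that is
 * actually in use.
 *
 *	int req = 65536, eff;
 *	socklen_t len = sizeof(eff);
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &req, sizeof(req));
 *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &eff, &len);
 *
 * Here eff normally reads back as twice the (possibly clamped) request.
 * SO_RCVBUFFORCE/SO_SNDBUFFORCE bypass the sysctl limits but require
 * CAP_NET_ADMIN.
 */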
925
926
927void cred_to_ucred(struct pid *pid, const struct cred *cred,
928		   struct ucred *ucred)
929{
930	ucred->pid = pid_vnr(pid);
931	ucred->uid = ucred->gid = -1;
932	if (cred) {
933		struct user_namespace *current_ns = current_user_ns();
934
935		ucred->uid = from_kuid_munged(current_ns, cred->euid);
936		ucred->gid = from_kgid_munged(current_ns, cred->egid);
937	}
938}
939EXPORT_SYMBOL_GPL(cred_to_ucred);
940
941int sock_getsockopt(struct socket *sock, int level, int optname,
942		    char __user *optval, int __user *optlen)
943{
944	struct sock *sk = sock->sk;
945
946	union {
947		int val;
948		struct linger ling;
949		struct timeval tm;
950	} v;
951
952	int lv = sizeof(int);
953	int len;
954
955	if (get_user(len, optlen))
956		return -EFAULT;
957	if (len < 0)
958		return -EINVAL;
959
960	memset(&v, 0, sizeof(v));
961
962	switch (optname) {
963	case SO_DEBUG:
964		v.val = sock_flag(sk, SOCK_DBG);
965		break;
966
967	case SO_DONTROUTE:
968		v.val = sock_flag(sk, SOCK_LOCALROUTE);
969		break;
970
971	case SO_BROADCAST:
972		v.val = sock_flag(sk, SOCK_BROADCAST);
973		break;
974
975	case SO_SNDBUF:
976		v.val = sk->sk_sndbuf;
977		break;
978
979	case SO_RCVBUF:
980		v.val = sk->sk_rcvbuf;
981		break;
982
983	case SO_REUSEADDR:
984		v.val = sk->sk_reuse;
985		break;
986
987	case SO_REUSEPORT:
988		v.val = sk->sk_reuseport;
989		break;
990
991	case SO_KEEPALIVE:
992		v.val = sock_flag(sk, SOCK_KEEPOPEN);
993		break;
994
995	case SO_TYPE:
996		v.val = sk->sk_type;
997		break;
998
999	case SO_PROTOCOL:
1000		v.val = sk->sk_protocol;
1001		break;
1002
1003	case SO_DOMAIN:
1004		v.val = sk->sk_family;
1005		break;
1006
1007	case SO_ERROR:
1008		v.val = -sock_error(sk);
1009		if (v.val == 0)
1010			v.val = xchg(&sk->sk_err_soft, 0);
1011		break;
1012
1013	case SO_OOBINLINE:
1014		v.val = sock_flag(sk, SOCK_URGINLINE);
1015		break;
1016
1017	case SO_NO_CHECK:
1018		v.val = sk->sk_no_check;
1019		break;
1020
1021	case SO_PRIORITY:
1022		v.val = sk->sk_priority;
1023		break;
1024
1025	case SO_LINGER:
1026		lv		= sizeof(v.ling);
1027		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
1028		v.ling.l_linger	= sk->sk_lingertime / HZ;
1029		break;
1030
1031	case SO_BSDCOMPAT:
1032		sock_warn_obsolete_bsdism("getsockopt");
1033		break;
1034
1035	case SO_TIMESTAMP:
1036		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1037				!sock_flag(sk, SOCK_RCVTSTAMPNS);
1038		break;
1039
1040	case SO_TIMESTAMPNS:
1041		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1042		break;
1043
1044	case SO_TIMESTAMPING:
1045		v.val = 0;
1046		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
1047			v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
1048		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
1049			v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
1050		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
1051			v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
1052		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
1053			v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
1054		if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
1055			v.val |= SOF_TIMESTAMPING_SOFTWARE;
1056		if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
1057			v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
1058		if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
1059			v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
1060		break;
1061
1062	case SO_RCVTIMEO:
1063		lv = sizeof(struct timeval);
1064		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1065			v.tm.tv_sec = 0;
1066			v.tm.tv_usec = 0;
1067		} else {
1068			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1069			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
1070		}
1071		break;
1072
1073	case SO_SNDTIMEO:
1074		lv = sizeof(struct timeval);
1075		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1076			v.tm.tv_sec = 0;
1077			v.tm.tv_usec = 0;
1078		} else {
1079			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1080			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
1081		}
1082		break;
1083
1084	case SO_RCVLOWAT:
1085		v.val = sk->sk_rcvlowat;
1086		break;
1087
1088	case SO_SNDLOWAT:
1089		v.val = 1;
1090		break;
1091
1092	case SO_PASSCRED:
1093		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1094		break;
1095
1096	case SO_PEERCRED:
1097	{
1098		struct ucred peercred;
1099		if (len > sizeof(peercred))
1100			len = sizeof(peercred);
1101		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1102		if (copy_to_user(optval, &peercred, len))
1103			return -EFAULT;
1104		goto lenout;
1105	}
1106
1107	case SO_PEERNAME:
1108	{
1109		char address[128];
1110
1111		if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
1112			return -ENOTCONN;
1113		if (lv < len)
1114			return -EINVAL;
1115		if (copy_to_user(optval, address, len))
1116			return -EFAULT;
1117		goto lenout;
1118	}
1119
1120	/* Dubious BSD thing... Probably nobody even uses it, but
1121	 * the UNIX standard wants it for whatever reason... -DaveM
1122	 */
1123	case SO_ACCEPTCONN:
1124		v.val = sk->sk_state == TCP_LISTEN;
1125		break;
1126
1127	case SO_PASSSEC:
1128		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1129		break;
1130
1131	case SO_PEERSEC:
1132		return security_socket_getpeersec_stream(sock, optval, optlen, len);
1133
1134	case SO_MARK:
1135		v.val = sk->sk_mark;
1136		break;
1137
1138	case SO_RXQ_OVFL:
1139		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1140		break;
1141
1142	case SO_WIFI_STATUS:
1143		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1144		break;
1145
1146	case SO_PEEK_OFF:
1147		if (!sock->ops->set_peek_off)
1148			return -EOPNOTSUPP;
1149
1150		v.val = sk->sk_peek_off;
1151		break;
1152	case SO_NOFCS:
1153		v.val = sock_flag(sk, SOCK_NOFCS);
1154		break;
1155
1156	case SO_BINDTODEVICE:
1157		return sock_getbindtodevice(sk, optval, optlen, len);
1158
1159	case SO_GET_FILTER:
1160		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1161		if (len < 0)
1162			return len;
1163
1164		goto lenout;
1165
1166	case SO_LOCK_FILTER:
1167		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1168		break;
1169
1170	case SO_SELECT_ERR_QUEUE:
1171		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1172		break;
1173
1174#ifdef CONFIG_NET_RX_BUSY_POLL
1175	case SO_BUSY_POLL:
1176		v.val = sk->sk_ll_usec;
1177		break;
1178#endif
1179
1180	default:
1181		return -ENOPROTOOPT;
1182	}
1183
1184	if (len > lv)
1185		len = lv;
1186	if (copy_to_user(optval, &v, len))
1187		return -EFAULT;
1188lenout:
1189	if (put_user(len, optlen))
1190		return -EFAULT;
1191	return 0;
1192}
1193
1194/*
1195 * Initialize an sk_lock.
1196 *
1197 * (We also register the sk_lock with the lock validator.)
1198 */
1199static inline void sock_lock_init(struct sock *sk)
1200{
1201	sock_lock_init_class_and_name(sk,
1202			af_family_slock_key_strings[sk->sk_family],
1203			af_family_slock_keys + sk->sk_family,
1204			af_family_key_strings[sk->sk_family],
1205			af_family_keys + sk->sk_family);
1206}
1207
1208/*
1209 * Copy all fields from osk to nsk, but nsk->sk_refcnt must not change yet,
1210 * even temporarily, because of RCU lookups. sk_node should also be left as is.
1211 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end.
1212 */
1213static void sock_copy(struct sock *nsk, const struct sock *osk)
1214{
1215#ifdef CONFIG_SECURITY_NETWORK
1216	void *sptr = nsk->sk_security;
1217#endif
1218	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1219
1220	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1221	       osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1222
1223#ifdef CONFIG_SECURITY_NETWORK
1224	nsk->sk_security = sptr;
1225	security_sk_clone(osk, nsk);
1226#endif
1227}
1228
1229void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
1230{
1231	unsigned long nulls1, nulls2;
1232
1233	nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
1234	nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
1235	if (nulls1 > nulls2)
1236		swap(nulls1, nulls2);
1237
1238	if (nulls1 != 0)
1239		memset((char *)sk, 0, nulls1);
1240	memset((char *)sk + nulls1 + sizeof(void *), 0,
1241	       nulls2 - nulls1 - sizeof(void *));
1242	memset((char *)sk + nulls2 + sizeof(void *), 0,
1243	       size - nulls2 - sizeof(void *));
1244}
1245EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
1246
1247static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1248		int family)
1249{
1250	struct sock *sk;
1251	struct kmem_cache *slab;
1252
1253	slab = prot->slab;
1254	if (slab != NULL) {
1255		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1256		if (!sk)
1257			return sk;
1258		if (priority & __GFP_ZERO) {
1259			if (prot->clear_sk)
1260				prot->clear_sk(sk, prot->obj_size);
1261			else
1262				sk_prot_clear_nulls(sk, prot->obj_size);
1263		}
1264	} else
1265		sk = kmalloc(prot->obj_size, priority);
1266
1267	if (sk != NULL) {
1268		kmemcheck_annotate_bitfield(sk, flags);
1269
1270		if (security_sk_alloc(sk, family, priority))
1271			goto out_free;
1272
1273		if (!try_module_get(prot->owner))
1274			goto out_free_sec;
1275		sk_tx_queue_clear(sk);
1276	}
1277
1278	return sk;
1279
1280out_free_sec:
1281	security_sk_free(sk);
1282out_free:
1283	if (slab != NULL)
1284		kmem_cache_free(slab, sk);
1285	else
1286		kfree(sk);
1287	return NULL;
1288}
1289
1290static void sk_prot_free(struct proto *prot, struct sock *sk)
1291{
1292	struct kmem_cache *slab;
1293	struct module *owner;
1294
1295	owner = prot->owner;
1296	slab = prot->slab;
1297
1298	security_sk_free(sk);
1299	if (slab != NULL)
1300		kmem_cache_free(slab, sk);
1301	else
1302		kfree(sk);
1303	module_put(owner);
1304}
1305
1306#if IS_ENABLED(CONFIG_NET_CLS_CGROUP)
1307void sock_update_classid(struct sock *sk)
1308{
1309	u32 classid;
1310
1311	classid = task_cls_classid(current);
1312	if (classid != sk->sk_classid)
1313		sk->sk_classid = classid;
1314}
1315EXPORT_SYMBOL(sock_update_classid);
1316#endif
1317
1318#if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
1319void sock_update_netprioidx(struct sock *sk)
1320{
1321	if (in_interrupt())
1322		return;
1323
1324	sk->sk_cgrp_prioidx = task_netprioidx(current);
1325}
1326EXPORT_SYMBOL_GPL(sock_update_netprioidx);
1327#endif
1328
1329/**
1330 *	sk_alloc - All socket objects are allocated here
1331 *	@net: the applicable net namespace
1332 *	@family: protocol family
1333 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1334 *	@prot: struct proto associated with this new sock instance
1335 */
1336struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1337		      struct proto *prot)
1338{
1339	struct sock *sk;
1340
1341	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1342	if (sk) {
1343		sk->sk_family = family;
1344		/*
1345		 * See comment in struct sock definition to understand
1346		 * why we need sk_prot_creator -acme
1347		 */
1348		sk->sk_prot = sk->sk_prot_creator = prot;
1349		sock_lock_init(sk);
1350		sock_net_set(sk, get_net(net));
1351		atomic_set(&sk->sk_wmem_alloc, 1);
1352
1353		sock_update_classid(sk);
1354		sock_update_netprioidx(sk);
1355	}
1356
1357	return sk;
1358}
1359EXPORT_SYMBOL(sk_alloc);
1360
1361static void __sk_free(struct sock *sk)
1362{
1363	struct sk_filter *filter;
1364
1365	if (sk->sk_destruct)
1366		sk->sk_destruct(sk);
1367
1368	filter = rcu_dereference_check(sk->sk_filter,
1369				       atomic_read(&sk->sk_wmem_alloc) == 0);
1370	if (filter) {
1371		sk_filter_uncharge(sk, filter);
1372		RCU_INIT_POINTER(sk->sk_filter, NULL);
1373	}
1374
1375	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1376
1377	if (atomic_read(&sk->sk_omem_alloc))
1378		pr_debug("%s: optmem leakage (%d bytes) detected\n",
1379			 __func__, atomic_read(&sk->sk_omem_alloc));
1380
1381	if (sk->sk_peer_cred)
1382		put_cred(sk->sk_peer_cred);
1383	put_pid(sk->sk_peer_pid);
1384	put_net(sock_net(sk));
1385	sk_prot_free(sk->sk_prot_creator, sk);
1386}
1387
1388void sk_free(struct sock *sk)
1389{
1390	/*
1391	 * We subtract one from sk_wmem_alloc and can know if
1392	 * We subtract one from sk_wmem_alloc and can tell whether
1393	 * some packets are still in some tx queue.
1394	 * If it is not zero, sock_wfree() will call __sk_free(sk) later.
1395	if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1396		__sk_free(sk);
1397}
1398EXPORT_SYMBOL(sk_free);
1399
1400/*
1401 * The last sock_put should drop the reference to sk->sk_net. It has already
1402 * been dropped in sk_change_net. Taking a reference to the stopping namespace
1403 * is not an option.
1404 * Take a reference to the socket to remove it from the hash while still _alive_,
1405 * and after that destroy it in the context of init_net.
1406 */
1407void sk_release_kernel(struct sock *sk)
1408{
1409	if (sk == NULL || sk->sk_socket == NULL)
1410		return;
1411
1412	sock_hold(sk);
1413	sock_release(sk->sk_socket);
1414	release_net(sock_net(sk));
1415	sock_net_set(sk, get_net(&init_net));
1416	sock_put(sk);
1417}
1418EXPORT_SYMBOL(sk_release_kernel);
1419
1420static void sk_update_clone(const struct sock *sk, struct sock *newsk)
1421{
1422	if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
1423		sock_update_memcg(newsk);
1424}
1425
1426/**
1427 *	sk_clone_lock - clone a socket, and lock its clone
1428 *	@sk: the socket to clone
1429 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1430 *
1431 *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1432 */
1433struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1434{
1435	struct sock *newsk;
1436
1437	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1438	if (newsk != NULL) {
1439		struct sk_filter *filter;
1440
1441		sock_copy(newsk, sk);
1442
1443		/* SANITY */
1444		get_net(sock_net(newsk));
1445		sk_node_init(&newsk->sk_node);
1446		sock_lock_init(newsk);
1447		bh_lock_sock(newsk);
1448		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
1449		newsk->sk_backlog.len = 0;
1450
1451		atomic_set(&newsk->sk_rmem_alloc, 0);
1452		/*
1453		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1454		 */
1455		atomic_set(&newsk->sk_wmem_alloc, 1);
1456		atomic_set(&newsk->sk_omem_alloc, 0);
1457		skb_queue_head_init(&newsk->sk_receive_queue);
1458		skb_queue_head_init(&newsk->sk_write_queue);
1459#ifdef CONFIG_NET_DMA
1460		skb_queue_head_init(&newsk->sk_async_wait_queue);
1461#endif
1462
1463		spin_lock_init(&newsk->sk_dst_lock);
1464		rwlock_init(&newsk->sk_callback_lock);
1465		lockdep_set_class_and_name(&newsk->sk_callback_lock,
1466				af_callback_keys + newsk->sk_family,
1467				af_family_clock_key_strings[newsk->sk_family]);
1468
1469		newsk->sk_dst_cache	= NULL;
1470		newsk->sk_wmem_queued	= 0;
1471		newsk->sk_forward_alloc = 0;
1472		newsk->sk_send_head	= NULL;
1473		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1474
1475		sock_reset_flag(newsk, SOCK_DONE);
1476		skb_queue_head_init(&newsk->sk_error_queue);
1477
1478		filter = rcu_dereference_protected(newsk->sk_filter, 1);
1479		if (filter != NULL)
1480			sk_filter_charge(newsk, filter);
1481
1482		if (unlikely(xfrm_sk_clone_policy(newsk))) {
1483			/* It is still a raw copy of the parent, so invalidate
1484			 * the destructor and do a plain sk_free() */
1485			newsk->sk_destruct = NULL;
1486			bh_unlock_sock(newsk);
1487			sk_free(newsk);
1488			newsk = NULL;
1489			goto out;
1490		}
1491
1492		newsk->sk_err	   = 0;
1493		newsk->sk_priority = 0;
1494		/*
1495		 * Before updating sk_refcnt, we must commit prior changes to memory
1496		 * (Documentation/RCU/rculist_nulls.txt for details)
1497		 */
1498		smp_wmb();
1499		atomic_set(&newsk->sk_refcnt, 2);
1500
1501		/*
1502		 * Increment the counter in the same struct proto as the master
1503		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1504		 * is the same as sk->sk_prot->socks, as this field was copied
1505		 * with memcpy).
1506		 *
1507		 * This _changes_ the previous behaviour, where
1508		 * tcp_create_openreq_child was always incrementing the
1509		 * equivalent of tcp_prot->socks (inet_sock_nr), so this has
1510		 * to be taken into account in all callers. -acme
1511		 */
1512		sk_refcnt_debug_inc(newsk);
1513		sk_set_socket(newsk, NULL);
1514		newsk->sk_wq = NULL;
1515
1516		sk_update_clone(sk, newsk);
1517
1518		if (newsk->sk_prot->sockets_allocated)
1519			sk_sockets_allocated_inc(newsk);
1520
1521		if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1522			net_enable_timestamp();
1523	}
1524out:
1525	return newsk;
1526}
1527EXPORT_SYMBOL_GPL(sk_clone_lock);
1528
1529void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1530{
1531	__sk_dst_set(sk, dst);
1532	sk->sk_route_caps = dst->dev->features;
1533	if (sk->sk_route_caps & NETIF_F_GSO)
1534		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1535	sk->sk_route_caps &= ~sk->sk_route_nocaps;
1536	if (sk_can_gso(sk)) {
1537		if (dst->header_len) {
1538			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1539		} else {
1540			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1541			sk->sk_gso_max_size = dst->dev->gso_max_size;
1542			sk->sk_gso_max_segs = dst->dev->gso_max_segs;
1543		}
1544	}
1545}
1546EXPORT_SYMBOL_GPL(sk_setup_caps);
1547
1548/*
1549 *	Simple resource managers for sockets.
1550 */
1551
1552
1553/*
1554 * Write buffer destructor automatically called from kfree_skb.
1555 */
1556void sock_wfree(struct sk_buff *skb)
1557{
1558	struct sock *sk = skb->sk;
1559	unsigned int len = skb->truesize;
1560
1561	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1562		/*
1563		 * Keep a reference on sk_wmem_alloc; it will be released
1564		 * after the sk_write_space() call.
1565		 */
1566		atomic_sub(len - 1, &sk->sk_wmem_alloc);
1567		sk->sk_write_space(sk);
1568		len = 1;
1569	}
1570	/*
1571	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1572	 * could not do because of in-flight packets
1573	 */
1574	if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1575		__sk_free(sk);
1576}
1577EXPORT_SYMBOL(sock_wfree);
1578
1579void skb_orphan_partial(struct sk_buff *skb)
1580{
1581	/* The TCP stack sets skb->ooo_okay based on sk_wmem_alloc,
1582	 * so we do not completely orphan the skb, but transfer all
1583	 * accounted bytes but one, to avoid unexpected reorders.
1584	 */
1585	if (skb->destructor == sock_wfree
1586#ifdef CONFIG_INET
1587	    || skb->destructor == tcp_wfree
1588#endif
1589		) {
1590		atomic_sub(skb->truesize - 1, &skb->sk->sk_wmem_alloc);
1591		skb->truesize = 1;
1592	} else {
1593		skb_orphan(skb);
1594	}
1595}
1596EXPORT_SYMBOL(skb_orphan_partial);
1597
1598/*
1599 * Read buffer destructor automatically called from kfree_skb.
1600 */
1601void sock_rfree(struct sk_buff *skb)
1602{
1603	struct sock *sk = skb->sk;
1604	unsigned int len = skb->truesize;
1605
1606	atomic_sub(len, &sk->sk_rmem_alloc);
1607	sk_mem_uncharge(sk, len);
1608}
1609EXPORT_SYMBOL(sock_rfree);
1610
1611void sock_edemux(struct sk_buff *skb)
1612{
1613	struct sock *sk = skb->sk;
1614
1615#ifdef CONFIG_INET
1616	if (sk->sk_state == TCP_TIME_WAIT)
1617		inet_twsk_put(inet_twsk(sk));
1618	else
1619#endif
1620		sock_put(sk);
1621}
1622EXPORT_SYMBOL(sock_edemux);
1623
1624kuid_t sock_i_uid(struct sock *sk)
1625{
1626	kuid_t uid;
1627
1628	read_lock_bh(&sk->sk_callback_lock);
1629	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1630	read_unlock_bh(&sk->sk_callback_lock);
1631	return uid;
1632}
1633EXPORT_SYMBOL(sock_i_uid);
1634
1635unsigned long sock_i_ino(struct sock *sk)
1636{
1637	unsigned long ino;
1638
1639	read_lock_bh(&sk->sk_callback_lock);
1640	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1641	read_unlock_bh(&sk->sk_callback_lock);
1642	return ino;
1643}
1644EXPORT_SYMBOL(sock_i_ino);
1645
1646/*
1647 * Allocate a skb from the socket's send buffer.
1648 */
1649struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1650			     gfp_t priority)
1651{
1652	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1653		struct sk_buff *skb = alloc_skb(size, priority);
1654		if (skb) {
1655			skb_set_owner_w(skb, sk);
1656			return skb;
1657		}
1658	}
1659	return NULL;
1660}
1661EXPORT_SYMBOL(sock_wmalloc);
1662
1663/*
1664 * Allocate a skb from the socket's receive buffer.
1665 */
1666struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
1667			     gfp_t priority)
1668{
1669	if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1670		struct sk_buff *skb = alloc_skb(size, priority);
1671		if (skb) {
1672			skb_set_owner_r(skb, sk);
1673			return skb;
1674		}
1675	}
1676	return NULL;
1677}
1678
1679/*
1680 * Allocate a memory block from the socket's option memory buffer.
1681 */
1682void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1683{
1684	if ((unsigned int)size <= sysctl_optmem_max &&
1685	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1686		void *mem;
1687		/* First do the add, to avoid the race if kmalloc
1688		 * might sleep.
1689		 */
1690		atomic_add(size, &sk->sk_omem_alloc);
1691		mem = kmalloc(size, priority);
1692		if (mem)
1693			return mem;
1694		atomic_sub(size, &sk->sk_omem_alloc);
1695	}
1696	return NULL;
1697}
1698EXPORT_SYMBOL(sock_kmalloc);
1699
1700/*
1701 * Free an option memory block.
1702 */
1703void sock_kfree_s(struct sock *sk, void *mem, int size)
1704{
1705	kfree(mem);
1706	atomic_sub(size, &sk->sk_omem_alloc);
1707}
1708EXPORT_SYMBOL(sock_kfree_s);
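
/*
 * Sketch of how a protocol might pair the option-memory helpers above
 * (hypothetical code, not taken from a real protocol): allocations are
 * charged to sk_omem_alloc and bounded by sysctl_optmem_max, so every
 * sock_kmalloc() needs a matching sock_kfree_s() with the same size.
 *
 *	struct foo_opt *opt;	(hypothetical per-socket option blob)
 *
 *	opt = sock_kmalloc(sk, sizeof(*opt), GFP_KERNEL);
 *	if (!opt)
 *		return -ENOBUFS;
 *	...
 *	sock_kfree_s(sk, opt, sizeof(*opt));
 */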
1709
1710/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1711   I think these locks should be removed for datagram sockets.
1712 */
1713static long sock_wait_for_wmem(struct sock *sk, long timeo)
1714{
1715	DEFINE_WAIT(wait);
1716
1717	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1718	for (;;) {
1719		if (!timeo)
1720			break;
1721		if (signal_pending(current))
1722			break;
1723		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1724		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1725		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1726			break;
1727		if (sk->sk_shutdown & SEND_SHUTDOWN)
1728			break;
1729		if (sk->sk_err)
1730			break;
1731		timeo = schedule_timeout(timeo);
1732	}
1733	finish_wait(sk_sleep(sk), &wait);
1734	return timeo;
1735}
1736
1737
1738/*
1739 *	Generic send/receive buffer handlers
1740 */
1741
1742struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1743				     unsigned long data_len, int noblock,
1744				     int *errcode, int max_page_order)
1745{
1746	struct sk_buff *skb = NULL;
1747	unsigned long chunk;
1748	gfp_t gfp_mask;
1749	long timeo;
1750	int err;
1751	int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1752	struct page *page;
1753	int i;
1754
1755	err = -EMSGSIZE;
1756	if (npages > MAX_SKB_FRAGS)
1757		goto failure;
1758
1759	timeo = sock_sndtimeo(sk, noblock);
1760	while (!skb) {
1761		err = sock_error(sk);
1762		if (err != 0)
1763			goto failure;
1764
1765		err = -EPIPE;
1766		if (sk->sk_shutdown & SEND_SHUTDOWN)
1767			goto failure;
1768
1769		if (atomic_read(&sk->sk_wmem_alloc) >= sk->sk_sndbuf) {
1770			set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1771			set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1772			err = -EAGAIN;
1773			if (!timeo)
1774				goto failure;
1775			if (signal_pending(current))
1776				goto interrupted;
1777			timeo = sock_wait_for_wmem(sk, timeo);
1778			continue;
1779		}
1780
1781		err = -ENOBUFS;
1782		gfp_mask = sk->sk_allocation;
1783		if (gfp_mask & __GFP_WAIT)
1784			gfp_mask |= __GFP_REPEAT;
1785
1786		skb = alloc_skb(header_len, gfp_mask);
1787		if (!skb)
1788			goto failure;
1789
1790		skb->truesize += data_len;
1791
1792		for (i = 0; npages > 0; i++) {
1793			int order = max_page_order;
1794
1795			while (order) {
1796				if (npages >= 1 << order) {
1797					page = alloc_pages(sk->sk_allocation |
1798							   __GFP_COMP | __GFP_NOWARN,
1799							   order);
1800					if (page)
1801						goto fill_page;
1802				}
1803				order--;
1804			}
1805			page = alloc_page(sk->sk_allocation);
1806			if (!page)
1807				goto failure;
1808fill_page:
1809			chunk = min_t(unsigned long, data_len,
1810				      PAGE_SIZE << order);
1811			skb_fill_page_desc(skb, i, page, 0, chunk);
1812			data_len -= chunk;
1813			npages -= 1 << order;
1814		}
1815	}
1816
1817	skb_set_owner_w(skb, sk);
1818	return skb;
1819
1820interrupted:
1821	err = sock_intr_errno(timeo);
1822failure:
1823	kfree_skb(skb);
1824	*errcode = err;
1825	return NULL;
1826}
1827EXPORT_SYMBOL(sock_alloc_send_pskb);
1828
1829struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1830				    int noblock, int *errcode)
1831{
1832	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
1833}
1834EXPORT_SYMBOL(sock_alloc_send_skb);
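
/*
 * Typical sendmsg()-path use of the helper above (simplified sketch, not
 * from any specific protocol; "hdrlen" and "len" are assumed sizes): the
 * skb is charged to sk_wmem_alloc, and the call blocks within the
 * SO_SNDTIMEO limit unless MSG_DONTWAIT was passed.
 *
 *	int err;
 *	struct sk_buff *skb;
 *
 *	skb = sock_alloc_send_skb(sk, hdrlen + len,
 *				  msg->msg_flags & MSG_DONTWAIT, &err);
 *	if (!skb)
 *		return err;
 */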
1835
1836/* On 32bit arches, an skb frag is limited to 2^15 */
1837#define SKB_FRAG_PAGE_ORDER	get_order(32768)
1838
1839bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
1840{
1841	int order;
1842
1843	if (pfrag->page) {
1844		if (atomic_read(&pfrag->page->_count) == 1) {
1845			pfrag->offset = 0;
1846			return true;
1847		}
1848		if (pfrag->offset < pfrag->size)
1849			return true;
1850		put_page(pfrag->page);
1851	}
1852
1853	/* We restrict high order allocations to users that can afford to wait */
1854	order = (sk->sk_allocation & __GFP_WAIT) ? SKB_FRAG_PAGE_ORDER : 0;
1855
1856	do {
1857		gfp_t gfp = sk->sk_allocation;
1858
1859		if (order)
1860			gfp |= __GFP_COMP | __GFP_NOWARN;
1861		pfrag->page = alloc_pages(gfp, order);
1862		if (likely(pfrag->page)) {
1863			pfrag->offset = 0;
1864			pfrag->size = PAGE_SIZE << order;
1865			return true;
1866		}
1867	} while (--order >= 0);
1868
1869	sk_enter_memory_pressure(sk);
1870	sk_stream_moderate_sndbuf(sk);
1871	return false;
1872}
1873EXPORT_SYMBOL(sk_page_frag_refill);
1874
1875static void __lock_sock(struct sock *sk)
1876	__releases(&sk->sk_lock.slock)
1877	__acquires(&sk->sk_lock.slock)
1878{
1879	DEFINE_WAIT(wait);
1880
1881	for (;;) {
1882		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1883					TASK_UNINTERRUPTIBLE);
1884		spin_unlock_bh(&sk->sk_lock.slock);
1885		schedule();
1886		spin_lock_bh(&sk->sk_lock.slock);
1887		if (!sock_owned_by_user(sk))
1888			break;
1889	}
1890	finish_wait(&sk->sk_lock.wq, &wait);
1891}
1892
1893static void __release_sock(struct sock *sk)
1894	__releases(&sk->sk_lock.slock)
1895	__acquires(&sk->sk_lock.slock)
1896{
1897	struct sk_buff *skb = sk->sk_backlog.head;
1898
1899	do {
1900		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1901		bh_unlock_sock(sk);
1902
1903		do {
1904			struct sk_buff *next = skb->next;
1905
1906			prefetch(next);
1907			WARN_ON_ONCE(skb_dst_is_noref(skb));
1908			skb->next = NULL;
1909			sk_backlog_rcv(sk, skb);
1910
1911			/*
1912			 * We are in process context here with softirqs
1913			 * disabled, use cond_resched_softirq() to preempt.
1914			 * This is safe to do because we've taken the backlog
1915			 * queue private:
1916			 */
1917			cond_resched_softirq();
1918
1919			skb = next;
1920		} while (skb != NULL);
1921
1922		bh_lock_sock(sk);
1923	} while ((skb = sk->sk_backlog.head) != NULL);
1924
1925	/*
1926	 * Doing the zeroing here guarantees we cannot loop forever
1927	 * while a wild producer attempts to flood us.
1928	 */
1929	sk->sk_backlog.len = 0;
1930}
1931
1932/**
1933 * sk_wait_data - wait for data to arrive at sk_receive_queue
1934 * @sk:    sock to wait on
1935 * @timeo: for how long
1936 *
1937 * Now socket state including sk->sk_err is changed only under the lock,
1938 * hence we may omit checks after joining the wait queue.
1939 * We check the receive queue before schedule() only as an optimization;
1940 * it is very likely that release_sock() added new data.
1941 */
1942int sk_wait_data(struct sock *sk, long *timeo)
1943{
1944	int rc;
1945	DEFINE_WAIT(wait);
1946
1947	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1948	set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1949	rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1950	clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1951	finish_wait(sk_sleep(sk), &wait);
1952	return rc;
1953}
1954EXPORT_SYMBOL(sk_wait_data);
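
/*
 * Sketch of the usual receive-side pattern built on sk_wait_data()
 * (simplified; called with the socket already locked via lock_sock(),
 * and real callers also check sk_err, shutdown and pending signals):
 *
 *	long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 *
 *	while (skb_queue_empty(&sk->sk_receive_queue)) {
 *		if (!timeo)
 *			return -EAGAIN;
 *		sk_wait_data(sk, &timeo);
 *	}
 */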
1955
1956/**
1957 *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
1958 *	@sk: socket
1959 *	@size: memory size to allocate
1960 *	@kind: allocation type
1961 *
1962 *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
1963 *	rmem allocation. This function assumes that protocols which have
1964 *	memory_pressure use sk_wmem_queued as write buffer accounting.
1965 */
1966int __sk_mem_schedule(struct sock *sk, int size, int kind)
1967{
1968	struct proto *prot = sk->sk_prot;
1969	int amt = sk_mem_pages(size);
1970	long allocated;
1971	int parent_status = UNDER_LIMIT;
1972
1973	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
1974
1975	allocated = sk_memory_allocated_add(sk, amt, &parent_status);
1976
1977	/* Under limit. */
1978	if (parent_status == UNDER_LIMIT &&
1979			allocated <= sk_prot_mem_limits(sk, 0)) {
1980		sk_leave_memory_pressure(sk);
1981		return 1;
1982	}
1983
1984	/* Under pressure. (we or our parents) */
1985	if ((parent_status > SOFT_LIMIT) ||
1986			allocated > sk_prot_mem_limits(sk, 1))
1987		sk_enter_memory_pressure(sk);
1988
1989	/* Over hard limit (we or our parents) */
1990	if ((parent_status == OVER_LIMIT) ||
1991			(allocated > sk_prot_mem_limits(sk, 2)))
1992		goto suppress_allocation;
1993
1994	/* guarantee minimum buffer size under pressure */
1995	if (kind == SK_MEM_RECV) {
1996		if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
1997			return 1;
1998
1999	} else { /* SK_MEM_SEND */
2000		if (sk->sk_type == SOCK_STREAM) {
2001			if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
2002				return 1;
2003		} else if (atomic_read(&sk->sk_wmem_alloc) <
2004			   prot->sysctl_wmem[0])
2005				return 1;
2006	}
2007
2008	if (sk_has_memory_pressure(sk)) {
2009		int alloc;
2010
2011		if (!sk_under_memory_pressure(sk))
2012			return 1;
2013		alloc = sk_sockets_allocated_read_positive(sk);
2014		if (sk_prot_mem_limits(sk, 2) > alloc *
2015		    sk_mem_pages(sk->sk_wmem_queued +
2016				 atomic_read(&sk->sk_rmem_alloc) +
2017				 sk->sk_forward_alloc))
2018			return 1;
2019	}
2020
2021suppress_allocation:
2022
2023	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2024		sk_stream_moderate_sndbuf(sk);
2025
2026		/* Fail only if socket is _under_ its sndbuf.
2027		 * In this case we cannot block, so we have to fail.
2028		 */
2029		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2030			return 1;
2031	}
2032
2033	trace_sock_exceed_buf_limit(sk, prot, allocated);
2034
2035	/* Alas. Undo changes. */
2036	sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
2037
2038	sk_memory_allocated_sub(sk, amt);
2039
2040	return 0;
2041}
2042EXPORT_SYMBOL(__sk_mem_schedule);
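
/*
 * For reference (paraphrased sketch of the include/net/sock.h helper, not a
 * definitive copy): protocols normally do not call __sk_mem_schedule()
 * directly but go through sk_wmem_schedule()/sk_rmem_schedule(), which only
 * drop into this slow path when sk_forward_alloc cannot cover the request.
 *
 *	static inline bool sk_wmem_schedule(struct sock *sk, int size)
 *	{
 *		if (!sk_has_account(sk))
 *			return true;
 *		return size <= sk->sk_forward_alloc ||
 *			__sk_mem_schedule(sk, size, SK_MEM_SEND);
 *	}
 */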
2043
2044/**
2045 *	__sk_mem_reclaim - reclaim memory_allocated
2046 *	@sk: socket
2047 */
2048void __sk_mem_reclaim(struct sock *sk)
2049{
2050	sk_memory_allocated_sub(sk,
2051				sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT);
2052	sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
2053
2054	if (sk_under_memory_pressure(sk) &&
2055	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2056		sk_leave_memory_pressure(sk);
2057}
2058EXPORT_SYMBOL(__sk_mem_reclaim);
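
/*
 * For reference (paraphrased sketch, see include/net/sock.h): the usual entry
 * point is sk_mem_reclaim(), which only calls in here once at least one full
 * quantum of forward-allocated memory can be returned.
 *
 *	static inline void sk_mem_reclaim(struct sock *sk)
 *	{
 *		if (!sk_has_account(sk))
 *			return;
 *		if (sk->sk_forward_alloc >= SK_MEM_QUANTUM)
 *			__sk_mem_reclaim(sk);
 *	}
 */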
2059
2060
2061/*
2062 * Set of default routines for initialising struct proto_ops when
2063 * the protocol does not support a particular function. In certain
2064 * cases where it makes no sense for a protocol to have a "do nothing"
2065 * function, some default processing is provided.
2066 */
2067
2068int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2069{
2070	return -EOPNOTSUPP;
2071}
2072EXPORT_SYMBOL(sock_no_bind);
2073
2074int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2075		    int len, int flags)
2076{
2077	return -EOPNOTSUPP;
2078}
2079EXPORT_SYMBOL(sock_no_connect);
2080
2081int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2082{
2083	return -EOPNOTSUPP;
2084}
2085EXPORT_SYMBOL(sock_no_socketpair);
2086
2087int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
2088{
2089	return -EOPNOTSUPP;
2090}
2091EXPORT_SYMBOL(sock_no_accept);
2092
2093int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2094		    int *len, int peer)
2095{
2096	return -EOPNOTSUPP;
2097}
2098EXPORT_SYMBOL(sock_no_getname);
2099
2100unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
2101{
2102	return 0;
2103}
2104EXPORT_SYMBOL(sock_no_poll);
2105
2106int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2107{
2108	return -EOPNOTSUPP;
2109}
2110EXPORT_SYMBOL(sock_no_ioctl);
2111
2112int sock_no_listen(struct socket *sock, int backlog)
2113{
2114	return -EOPNOTSUPP;
2115}
2116EXPORT_SYMBOL(sock_no_listen);
2117
2118int sock_no_shutdown(struct socket *sock, int how)
2119{
2120	return -EOPNOTSUPP;
2121}
2122EXPORT_SYMBOL(sock_no_shutdown);
2123
2124int sock_no_setsockopt(struct socket *sock, int level, int optname,
2125		    char __user *optval, unsigned int optlen)
2126{
2127	return -EOPNOTSUPP;
2128}
2129EXPORT_SYMBOL(sock_no_setsockopt);
2130
2131int sock_no_getsockopt(struct socket *sock, int level, int optname,
2132		    char __user *optval, int __user *optlen)
2133{
2134	return -EOPNOTSUPP;
2135}
2136EXPORT_SYMBOL(sock_no_getsockopt);
2137
2138int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
2139		    size_t len)
2140{
2141	return -EOPNOTSUPP;
2142}
2143EXPORT_SYMBOL(sock_no_sendmsg);
2144
2145int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
2146		    size_t len, int flags)
2147{
2148	return -EOPNOTSUPP;
2149}
2150EXPORT_SYMBOL(sock_no_recvmsg);
2151
2152int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2153{
2154	/* Mirror missing mmap method error code */
2155	return -ENODEV;
2156}
2157EXPORT_SYMBOL(sock_no_mmap);
2158
2159ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2160{
2161	ssize_t res;
2162	struct msghdr msg = {.msg_flags = flags};
2163	struct kvec iov;
2164	char *kaddr = kmap(page);
2165	iov.iov_base = kaddr + offset;
2166	iov.iov_len = size;
2167	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2168	kunmap(page);
2169	return res;
2170}
2171EXPORT_SYMBOL(sock_no_sendpage);
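
/*
 * Usage sketch (hypothetical address family): a protocol wires the sock_no_*
 * stubs into its proto_ops for operations it does not support; PF_EXAMPLE and
 * example_ops are placeholders, not real identifiers.
 *
 *	static const struct proto_ops example_ops = {
 *		.family		= PF_EXAMPLE,
 *		.owner		= THIS_MODULE,
 *		.socketpair	= sock_no_socketpair,
 *		.accept		= sock_no_accept,
 *		.ioctl		= sock_no_ioctl,
 *		.listen		= sock_no_listen,
 *		.shutdown	= sock_no_shutdown,
 *		.mmap		= sock_no_mmap,
 *		.sendpage	= sock_no_sendpage,
 *		// ...plus the operations the protocol actually implements...
 *	};
 */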
2172
2173/*
2174 *	Default Socket Callbacks
2175 */
2176
2177static void sock_def_wakeup(struct sock *sk)
2178{
2179	struct socket_wq *wq;
2180
2181	rcu_read_lock();
2182	wq = rcu_dereference(sk->sk_wq);
2183	if (wq_has_sleeper(wq))
2184		wake_up_interruptible_all(&wq->wait);
2185	rcu_read_unlock();
2186}
2187
2188static void sock_def_error_report(struct sock *sk)
2189{
2190	struct socket_wq *wq;
2191
2192	rcu_read_lock();
2193	wq = rcu_dereference(sk->sk_wq);
2194	if (wq_has_sleeper(wq))
2195		wake_up_interruptible_poll(&wq->wait, POLLERR);
2196	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2197	rcu_read_unlock();
2198}
2199
2200static void sock_def_readable(struct sock *sk, int len)
2201{
2202	struct socket_wq *wq;
2203
2204	rcu_read_lock();
2205	wq = rcu_dereference(sk->sk_wq);
2206	if (wq_has_sleeper(wq))
2207		wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
2208						POLLRDNORM | POLLRDBAND);
2209	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2210	rcu_read_unlock();
2211}
2212
2213static void sock_def_write_space(struct sock *sk)
2214{
2215	struct socket_wq *wq;
2216
2217	rcu_read_lock();
2218
2219	/* Do not wake up a writer until he can make "significant"
2220	 * progress.  --DaveM
2221	 */
2222	if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2223		wq = rcu_dereference(sk->sk_wq);
2224		if (wq_has_sleeper(wq))
2225			wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
2226						POLLWRNORM | POLLWRBAND);
2227
2228		/* Should agree with poll, otherwise some programs break */
2229		if (sock_writeable(sk))
2230			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2231	}
2232
2233	rcu_read_unlock();
2234}
2235
2236static void sock_def_destruct(struct sock *sk)
2237{
2238	kfree(sk->sk_protinfo);
2239}
2240
2241void sk_send_sigurg(struct sock *sk)
2242{
2243	if (sk->sk_socket && sk->sk_socket->file)
2244		if (send_sigurg(&sk->sk_socket->file->f_owner))
2245			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2246}
2247EXPORT_SYMBOL(sk_send_sigurg);
2248
2249void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2250		    unsigned long expires)
2251{
2252	if (!mod_timer(timer, expires))
2253		sock_hold(sk);
2254}
2255EXPORT_SYMBOL(sk_reset_timer);
2256
2257void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2258{
2259	if (del_timer(timer))
2260		__sock_put(sk);
2261}
2262EXPORT_SYMBOL(sk_stop_timer);
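
/*
 * Usage sketch (illustrative only): sk_reset_timer()/sk_stop_timer() keep the
 * socket refcount balanced against the pending timer, so a protocol can arm a
 * per-socket timer without racing with socket destruction; the one-second
 * timeout is an arbitrary example value.
 *
 *	sk_reset_timer(sk, &sk->sk_timer, jiffies + HZ);  // holds a ref if newly armed
 *	...
 *	sk_stop_timer(sk, &sk->sk_timer);                 // drops the ref if it was pending
 */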
2263
2264void sock_init_data(struct socket *sock, struct sock *sk)
2265{
2266	skb_queue_head_init(&sk->sk_receive_queue);
2267	skb_queue_head_init(&sk->sk_write_queue);
2268	skb_queue_head_init(&sk->sk_error_queue);
2269#ifdef CONFIG_NET_DMA
2270	skb_queue_head_init(&sk->sk_async_wait_queue);
2271#endif
2272
2273	sk->sk_send_head	=	NULL;
2274
2275	init_timer(&sk->sk_timer);
2276
2277	sk->sk_allocation	=	GFP_KERNEL;
2278	sk->sk_rcvbuf		=	sysctl_rmem_default;
2279	sk->sk_sndbuf		=	sysctl_wmem_default;
2280	sk->sk_state		=	TCP_CLOSE;
2281	sk_set_socket(sk, sock);
2282
2283	sock_set_flag(sk, SOCK_ZAPPED);
2284
2285	if (sock) {
2286		sk->sk_type	=	sock->type;
2287		sk->sk_wq	=	sock->wq;
2288		sock->sk	=	sk;
2289	} else
2290		sk->sk_wq	=	NULL;
2291
2292	spin_lock_init(&sk->sk_dst_lock);
2293	rwlock_init(&sk->sk_callback_lock);
2294	lockdep_set_class_and_name(&sk->sk_callback_lock,
2295			af_callback_keys + sk->sk_family,
2296			af_family_clock_key_strings[sk->sk_family]);
2297
2298	sk->sk_state_change	=	sock_def_wakeup;
2299	sk->sk_data_ready	=	sock_def_readable;
2300	sk->sk_write_space	=	sock_def_write_space;
2301	sk->sk_error_report	=	sock_def_error_report;
2302	sk->sk_destruct		=	sock_def_destruct;
2303
2304	sk->sk_frag.page	=	NULL;
2305	sk->sk_frag.offset	=	0;
2306	sk->sk_peek_off		=	-1;
2307
2308	sk->sk_peer_pid 	=	NULL;
2309	sk->sk_peer_cred	=	NULL;
2310	sk->sk_write_pending	=	0;
2311	sk->sk_rcvlowat		=	1;
2312	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
2313	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
2314
2315	sk->sk_stamp = ktime_set(-1L, 0);
2316
2317#ifdef CONFIG_NET_RX_BUSY_POLL
2318	sk->sk_napi_id		=	0;
2319	sk->sk_ll_usec		=	sysctl_net_busy_read;
2320#endif
2321
2322	sk->sk_pacing_rate = ~0U;
2323	/*
2324	 * Before updating sk_refcnt, we must commit prior changes to memory
2325	 * (Documentation/RCU/rculist_nulls.txt for details)
2326	 */
2327	smp_wmb();
2328	atomic_set(&sk->sk_refcnt, 1);
2329	atomic_set(&sk->sk_drops, 0);
2330}
2331EXPORT_SYMBOL(sock_init_data);
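
/*
 * Usage sketch (hypothetical protocol create path): sock_init_data() is
 * normally called right after sk_alloc() from an address family's create
 * handler; PF_EXAMPLE and example_proto are placeholders.
 *
 *	sk = sk_alloc(net, PF_EXAMPLE, GFP_KERNEL, &example_proto);
 *	if (!sk)
 *		return -ENOBUFS;
 *	sock_init_data(sock, sk);
 *	sk->sk_protocol = protocol;
 *	// ...protocol-specific initialisation, then return 0...
 */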
2332
2333void lock_sock_nested(struct sock *sk, int subclass)
2334{
2335	might_sleep();
2336	spin_lock_bh(&sk->sk_lock.slock);
2337	if (sk->sk_lock.owned)
2338		__lock_sock(sk);
2339	sk->sk_lock.owned = 1;
2340	spin_unlock(&sk->sk_lock.slock);
2341	/*
2342	 * The sk_lock has mutex_lock() semantics here:
2343	 */
2344	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2345	local_bh_enable();
2346}
2347EXPORT_SYMBOL(lock_sock_nested);
2348
2349void release_sock(struct sock *sk)
2350{
2351	/*
2352	 * The sk_lock has mutex_unlock() semantics:
2353	 */
2354	mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
2355
2356	spin_lock_bh(&sk->sk_lock.slock);
2357	if (sk->sk_backlog.tail)
2358		__release_sock(sk);
2359
2360	if (sk->sk_prot->release_cb)
2361		sk->sk_prot->release_cb(sk);
2362
2363	sk->sk_lock.owned = 0;
2364	if (waitqueue_active(&sk->sk_lock.wq))
2365		wake_up(&sk->sk_lock.wq);
2366	spin_unlock_bh(&sk->sk_lock.slock);
2367}
2368EXPORT_SYMBOL(release_sock);
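
/*
 * Usage sketch (illustrative only): process-context code brackets access to
 * socket state with lock_sock()/release_sock(); release_sock() also processes
 * any packets that softirq context queued on the backlog while we owned the
 * lock.
 *
 *	lock_sock(sk);
 *	// ...read or modify sk state, possibly sleeping...
 *	release_sock(sk);
 */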
2369
2370/**
2371 * lock_sock_fast - fast version of lock_sock
2372 * @sk: socket
2373 *
2374 * This version should be used for very small sections, where the process won't block.
2375 * Returns false if the fast path is taken:
2376 *   sk_lock.slock locked, owned = 0, BH disabled
2377 * Returns true if the slow path is taken:
2378 *   sk_lock.slock unlocked, owned = 1, BH enabled
2379 */
2380bool lock_sock_fast(struct sock *sk)
2381{
2382	might_sleep();
2383	spin_lock_bh(&sk->sk_lock.slock);
2384
2385	if (!sk->sk_lock.owned)
2386		/*
2387		 * Note: the fast path returns with sk_lock.slock held and BH disabled.
2388		 */
2389		return false;
2390
2391	__lock_sock(sk);
2392	sk->sk_lock.owned = 1;
2393	spin_unlock(&sk->sk_lock.slock);
2394	/*
2395	 * The sk_lock has mutex_lock() semantics here:
2396	 */
2397	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2398	local_bh_enable();
2399	return true;
2400}
2401EXPORT_SYMBOL(lock_sock_fast);
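
/*
 * Usage sketch (illustrative only): pair lock_sock_fast() with
 * unlock_sock_fast() and keep the protected section short and non-sleeping,
 * since the fast path runs with BH disabled.
 *
 *	bool slow = lock_sock_fast(sk);
 *
 *	// ...short, non-blocking access to sk state...
 *	unlock_sock_fast(sk, slow);
 */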
2402
2403int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2404{
2405	struct timeval tv;
2406	if (!sock_flag(sk, SOCK_TIMESTAMP))
2407		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2408	tv = ktime_to_timeval(sk->sk_stamp);
2409	if (tv.tv_sec == -1)
2410		return -ENOENT;
2411	if (tv.tv_sec == 0) {
2412		sk->sk_stamp = ktime_get_real();
2413		tv = ktime_to_timeval(sk->sk_stamp);
2414	}
2415	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2416}
2417EXPORT_SYMBOL(sock_get_timestamp);
2418
2419int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2420{
2421	struct timespec ts;
2422	if (!sock_flag(sk, SOCK_TIMESTAMP))
2423		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2424	ts = ktime_to_timespec(sk->sk_stamp);
2425	if (ts.tv_sec == -1)
2426		return -ENOENT;
2427	if (ts.tv_sec == 0) {
2428		sk->sk_stamp = ktime_get_real();
2429		ts = ktime_to_timespec(sk->sk_stamp);
2430	}
2431	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2432}
2433EXPORT_SYMBOL(sock_get_timestampns);
2434
2435void sock_enable_timestamp(struct sock *sk, int flag)
2436{
2437	if (!sock_flag(sk, flag)) {
2438		unsigned long previous_flags = sk->sk_flags;
2439
2440		sock_set_flag(sk, flag);
2441		/*
2442		 * We just set one of the two flags that require net
2443		 * time stamping, but time stamping might already have been
2444		 * on because of the other one.
2445		 */
2446		if (!(previous_flags & SK_FLAGS_TIMESTAMP))
2447			net_enable_timestamp();
2448	}
2449}
2450
2451int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
2452		       int level, int type)
2453{
2454	struct sock_exterr_skb *serr;
2455	struct sk_buff *skb, *skb2;
2456	int copied, err;
2457
2458	err = -EAGAIN;
2459	skb = skb_dequeue(&sk->sk_error_queue);
2460	if (skb == NULL)
2461		goto out;
2462
2463	copied = skb->len;
2464	if (copied > len) {
2465		msg->msg_flags |= MSG_TRUNC;
2466		copied = len;
2467	}
2468	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
2469	if (err)
2470		goto out_free_skb;
2471
2472	sock_recv_timestamp(msg, sk, skb);
2473
2474	serr = SKB_EXT_ERR(skb);
2475	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
2476
2477	msg->msg_flags |= MSG_ERRQUEUE;
2478	err = copied;
2479
2480	/* Reset and regenerate socket error */
2481	spin_lock_bh(&sk->sk_error_queue.lock);
2482	sk->sk_err = 0;
2483	if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) {
2484		sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno;
2485		spin_unlock_bh(&sk->sk_error_queue.lock);
2486		sk->sk_error_report(sk);
2487	} else
2488		spin_unlock_bh(&sk->sk_error_queue.lock);
2489
2490out_free_skb:
2491	kfree_skb(skb);
2492out:
2493	return err;
2494}
2495EXPORT_SYMBOL(sock_recv_errqueue);
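
/*
 * Usage sketch (illustrative only): a datagram protocol's recvmsg handler can
 * service MSG_ERRQUEUE requests by delegating to sock_recv_errqueue(); the
 * SOL_IP/IP_RECVERR cmsg pair shown here is just one plausible choice.
 *
 *	if (flags & MSG_ERRQUEUE)
 *		return sock_recv_errqueue(sk, msg, len, SOL_IP, IP_RECVERR);
 */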
2496
2497/*
2498 *	Get a socket option on a socket.
2499 *
2500 *	FIX: POSIX 1003.1g is very ambiguous here. It states that
2501 *	asynchronous errors should be reported by getsockopt. We assume
2502 *	this means if you specify SO_ERROR (otherwise what's the point of it?).
2503 */
2504int sock_common_getsockopt(struct socket *sock, int level, int optname,
2505			   char __user *optval, int __user *optlen)
2506{
2507	struct sock *sk = sock->sk;
2508
2509	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2510}
2511EXPORT_SYMBOL(sock_common_getsockopt);
2512
2513#ifdef CONFIG_COMPAT
2514int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2515				  char __user *optval, int __user *optlen)
2516{
2517	struct sock *sk = sock->sk;
2518
2519	if (sk->sk_prot->compat_getsockopt != NULL)
2520		return sk->sk_prot->compat_getsockopt(sk, level, optname,
2521						      optval, optlen);
2522	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2523}
2524EXPORT_SYMBOL(compat_sock_common_getsockopt);
2525#endif
2526
2527int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
2528			struct msghdr *msg, size_t size, int flags)
2529{
2530	struct sock *sk = sock->sk;
2531	int addr_len = 0;
2532	int err;
2533
2534	err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
2535				   flags & ~MSG_DONTWAIT, &addr_len);
2536	if (err >= 0)
2537		msg->msg_namelen = addr_len;
2538	return err;
2539}
2540EXPORT_SYMBOL(sock_common_recvmsg);
2541
2542/*
2543 *	Set socket options on a socket.
2544 */
2545int sock_common_setsockopt(struct socket *sock, int level, int optname,
2546			   char __user *optval, unsigned int optlen)
2547{
2548	struct sock *sk = sock->sk;
2549
2550	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2551}
2552EXPORT_SYMBOL(sock_common_setsockopt);
2553
2554#ifdef CONFIG_COMPAT
2555int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2556				  char __user *optval, unsigned int optlen)
2557{
2558	struct sock *sk = sock->sk;
2559
2560	if (sk->sk_prot->compat_setsockopt != NULL)
2561		return sk->sk_prot->compat_setsockopt(sk, level, optname,
2562						      optval, optlen);
2563	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2564}
2565EXPORT_SYMBOL(compat_sock_common_setsockopt);
2566#endif
2567
2568void sk_common_release(struct sock *sk)
2569{
2570	if (sk->sk_prot->destroy)
2571		sk->sk_prot->destroy(sk);
2572
2573	/*
2574	 * Observation: when sk_common_release() is called, userspace processes
2575	 * no longer have access to the socket, but the network stack still does.
2576	 * Step one, detach it from networking:
2577	 *
2578	 * A. Remove it from the hash tables.
2579	 */
2580
2581	sk->sk_prot->unhash(sk);
2582
2583	/*
2584	 * At this point the socket cannot receive new packets, but it is possible
2585	 * that some packets are still in flight because another CPU ran the receiver
2586	 * and did the hash table lookup before we unhashed the socket. They will
2587	 * reach the receive queue and be purged by the socket destructor.
2588	 *
2589	 * We also still have packets pending on the receive queue and, probably,
2590	 * our own packets waiting in device queues. sock_destroy will drain the
2591	 * receive queue, but transmitted packets will delay socket destruction
2592	 * until the last reference is released.
2593	 */
2594
2595	sock_orphan(sk);
2596
2597	xfrm_sk_free_policy(sk);
2598
2599	sk_refcnt_debug_release(sk);
2600
2601	if (sk->sk_frag.page) {
2602		put_page(sk->sk_frag.page);
2603		sk->sk_frag.page = NULL;
2604	}
2605
2606	sock_put(sk);
2607}
2608EXPORT_SYMBOL(sk_common_release);
2609
2610#ifdef CONFIG_PROC_FS
2611#define PROTO_INUSE_NR	64	/* should be enough for the first time */
2612struct prot_inuse {
2613	int val[PROTO_INUSE_NR];
2614};
2615
2616static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2617
2618#ifdef CONFIG_NET_NS
2619void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2620{
2621	__this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
2622}
2623EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2624
2625int sock_prot_inuse_get(struct net *net, struct proto *prot)
2626{
2627	int cpu, idx = prot->inuse_idx;
2628	int res = 0;
2629
2630	for_each_possible_cpu(cpu)
2631		res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2632
2633	return res >= 0 ? res : 0;
2634}
2635EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2636
2637static int __net_init sock_inuse_init_net(struct net *net)
2638{
2639	net->core.inuse = alloc_percpu(struct prot_inuse);
2640	return net->core.inuse ? 0 : -ENOMEM;
2641}
2642
2643static void __net_exit sock_inuse_exit_net(struct net *net)
2644{
2645	free_percpu(net->core.inuse);
2646}
2647
2648static struct pernet_operations net_inuse_ops = {
2649	.init = sock_inuse_init_net,
2650	.exit = sock_inuse_exit_net,
2651};
2652
2653static __init int net_inuse_init(void)
2654{
2655	if (register_pernet_subsys(&net_inuse_ops))
2656		panic("Cannot initialize net inuse counters");
2657
2658	return 0;
2659}
2660
2661core_initcall(net_inuse_init);
2662#else
2663static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2664
2665void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2666{
2667	__this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
2668}
2669EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2670
2671int sock_prot_inuse_get(struct net *net, struct proto *prot)
2672{
2673	int cpu, idx = prot->inuse_idx;
2674	int res = 0;
2675
2676	for_each_possible_cpu(cpu)
2677		res += per_cpu(prot_inuse, cpu).val[idx];
2678
2679	return res >= 0 ? res : 0;
2680}
2681EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2682#endif
2683
2684static void assign_proto_idx(struct proto *prot)
2685{
2686	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2687
2688	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2689		pr_err("PROTO_INUSE_NR exhausted\n");
2690		return;
2691	}
2692
2693	set_bit(prot->inuse_idx, proto_inuse_idx);
2694}
2695
2696static void release_proto_idx(struct proto *prot)
2697{
2698	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2699		clear_bit(prot->inuse_idx, proto_inuse_idx);
2700}
2701#else
2702static inline void assign_proto_idx(struct proto *prot)
2703{
2704}
2705
2706static inline void release_proto_idx(struct proto *prot)
2707{
2708}
2709#endif
2710
2711int proto_register(struct proto *prot, int alloc_slab)
2712{
2713	if (alloc_slab) {
2714		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2715					SLAB_HWCACHE_ALIGN | prot->slab_flags,
2716					NULL);
2717
2718		if (prot->slab == NULL) {
2719			pr_crit("%s: Can't create sock SLAB cache!\n",
2720				prot->name);
2721			goto out;
2722		}
2723
2724		if (prot->rsk_prot != NULL) {
2725			prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name);
2726			if (prot->rsk_prot->slab_name == NULL)
2727				goto out_free_sock_slab;
2728
2729			prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
2730								 prot->rsk_prot->obj_size, 0,
2731								 SLAB_HWCACHE_ALIGN, NULL);
2732
2733			if (prot->rsk_prot->slab == NULL) {
2734				pr_crit("%s: Can't create request sock SLAB cache!\n",
2735					prot->name);
2736				goto out_free_request_sock_slab_name;
2737			}
2738		}
2739
2740		if (prot->twsk_prot != NULL) {
2741			prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
2742
2743			if (prot->twsk_prot->twsk_slab_name == NULL)
2744				goto out_free_request_sock_slab;
2745
2746			prot->twsk_prot->twsk_slab =
2747				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2748						  prot->twsk_prot->twsk_obj_size,
2749						  0,
2750						  SLAB_HWCACHE_ALIGN |
2751							prot->slab_flags,
2752						  NULL);
2753			if (prot->twsk_prot->twsk_slab == NULL)
2754				goto out_free_timewait_sock_slab_name;
2755		}
2756	}
2757
2758	mutex_lock(&proto_list_mutex);
2759	list_add(&prot->node, &proto_list);
2760	assign_proto_idx(prot);
2761	mutex_unlock(&proto_list_mutex);
2762	return 0;
2763
2764out_free_timewait_sock_slab_name:
2765	kfree(prot->twsk_prot->twsk_slab_name);
2766out_free_request_sock_slab:
2767	if (prot->rsk_prot && prot->rsk_prot->slab) {
2768		kmem_cache_destroy(prot->rsk_prot->slab);
2769		prot->rsk_prot->slab = NULL;
2770	}
2771out_free_request_sock_slab_name:
2772	if (prot->rsk_prot)
2773		kfree(prot->rsk_prot->slab_name);
2774out_free_sock_slab:
2775	kmem_cache_destroy(prot->slab);
2776	prot->slab = NULL;
2777out:
2778	return -ENOBUFS;
2779}
2780EXPORT_SYMBOL(proto_register);
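
/*
 * Usage sketch (hypothetical protocol): proto_register() is typically called
 * from a module init path with a statically defined struct proto, and paired
 * with proto_unregister() on exit; "EXAMPLE" and struct example_sock are
 * placeholders.
 *
 *	static struct proto example_proto = {
 *		.name	  = "EXAMPLE",
 *		.owner	  = THIS_MODULE,
 *		.obj_size = sizeof(struct example_sock),
 *	};
 *
 *	err = proto_register(&example_proto, 1);	// 1 => allocate a slab cache
 *	...
 *	proto_unregister(&example_proto);
 */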
2781
2782void proto_unregister(struct proto *prot)
2783{
2784	mutex_lock(&proto_list_mutex);
2785	release_proto_idx(prot);
2786	list_del(&prot->node);
2787	mutex_unlock(&proto_list_mutex);
2788
2789	if (prot->slab != NULL) {
2790		kmem_cache_destroy(prot->slab);
2791		prot->slab = NULL;
2792	}
2793
2794	if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
2795		kmem_cache_destroy(prot->rsk_prot->slab);
2796		kfree(prot->rsk_prot->slab_name);
2797		prot->rsk_prot->slab = NULL;
2798	}
2799
2800	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2801		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2802		kfree(prot->twsk_prot->twsk_slab_name);
2803		prot->twsk_prot->twsk_slab = NULL;
2804	}
2805}
2806EXPORT_SYMBOL(proto_unregister);
2807
2808#ifdef CONFIG_PROC_FS
2809static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2810	__acquires(proto_list_mutex)
2811{
2812	mutex_lock(&proto_list_mutex);
2813	return seq_list_start_head(&proto_list, *pos);
2814}
2815
2816static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2817{
2818	return seq_list_next(v, &proto_list, pos);
2819}
2820
2821static void proto_seq_stop(struct seq_file *seq, void *v)
2822	__releases(proto_list_mutex)
2823{
2824	mutex_unlock(&proto_list_mutex);
2825}
2826
2827static char proto_method_implemented(const void *method)
2828{
2829	return method == NULL ? 'n' : 'y';
2830}
2831static long sock_prot_memory_allocated(struct proto *proto)
2832{
2833	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
2834}
2835
2836static char *sock_prot_memory_pressure(struct proto *proto)
2837{
2838	return proto->memory_pressure != NULL ?
2839	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
2840}
2841
2842static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2843{
2844
2845	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
2846			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2847		   proto->name,
2848		   proto->obj_size,
2849		   sock_prot_inuse_get(seq_file_net(seq), proto),
2850		   sock_prot_memory_allocated(proto),
2851		   sock_prot_memory_pressure(proto),
2852		   proto->max_header,
2853		   proto->slab == NULL ? "no" : "yes",
2854		   module_name(proto->owner),
2855		   proto_method_implemented(proto->close),
2856		   proto_method_implemented(proto->connect),
2857		   proto_method_implemented(proto->disconnect),
2858		   proto_method_implemented(proto->accept),
2859		   proto_method_implemented(proto->ioctl),
2860		   proto_method_implemented(proto->init),
2861		   proto_method_implemented(proto->destroy),
2862		   proto_method_implemented(proto->shutdown),
2863		   proto_method_implemented(proto->setsockopt),
2864		   proto_method_implemented(proto->getsockopt),
2865		   proto_method_implemented(proto->sendmsg),
2866		   proto_method_implemented(proto->recvmsg),
2867		   proto_method_implemented(proto->sendpage),
2868		   proto_method_implemented(proto->bind),
2869		   proto_method_implemented(proto->backlog_rcv),
2870		   proto_method_implemented(proto->hash),
2871		   proto_method_implemented(proto->unhash),
2872		   proto_method_implemented(proto->get_port),
2873		   proto_method_implemented(proto->enter_memory_pressure));
2874}
2875
2876static int proto_seq_show(struct seq_file *seq, void *v)
2877{
2878	if (v == &proto_list)
2879		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2880			   "protocol",
2881			   "size",
2882			   "sockets",
2883			   "memory",
2884			   "press",
2885			   "maxhdr",
2886			   "slab",
2887			   "module",
2888			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2889	else
2890		proto_seq_printf(seq, list_entry(v, struct proto, node));
2891	return 0;
2892}
2893
2894static const struct seq_operations proto_seq_ops = {
2895	.start  = proto_seq_start,
2896	.next   = proto_seq_next,
2897	.stop   = proto_seq_stop,
2898	.show   = proto_seq_show,
2899};
2900
2901static int proto_seq_open(struct inode *inode, struct file *file)
2902{
2903	return seq_open_net(inode, file, &proto_seq_ops,
2904			    sizeof(struct seq_net_private));
2905}
2906
2907static const struct file_operations proto_seq_fops = {
2908	.owner		= THIS_MODULE,
2909	.open		= proto_seq_open,
2910	.read		= seq_read,
2911	.llseek		= seq_lseek,
2912	.release	= seq_release_net,
2913};
2914
2915static __net_init int proto_init_net(struct net *net)
2916{
2917	if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
2918		return -ENOMEM;
2919
2920	return 0;
2921}
2922
2923static __net_exit void proto_exit_net(struct net *net)
2924{
2925	remove_proc_entry("protocols", net->proc_net);
2926}
2927
2928
2929static __net_initdata struct pernet_operations proto_net_ops = {
2930	.init = proto_init_net,
2931	.exit = proto_exit_net,
2932};
2933
2934static int __init proto_init(void)
2935{
2936	return register_pernet_subsys(&proto_net_ops);
2937}
2938
2939subsys_initcall(proto_init);
2940
2941#endif /* PROC_FS */
2942