sock.c revision 884cf705c7e60bc6ade7ddafcbe943af4dc84604
1/*
2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
3 *		operating system.  INET is implemented using the  BSD Socket
4 *		interface as the means of communication with the user level.
5 *
6 *		Generic socket support routines. Memory allocators, socket lock/release
7 *		handler for protocols to use and generic option handler.
8 *
9 *
10 * Authors:	Ross Biro
11 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 *		Florian La Roche, <flla@stud.uni-sb.de>
13 *		Alan Cox, <A.Cox@swansea.ac.uk>
14 *
15 * Fixes:
16 *		Alan Cox	: 	Numerous verify_area() problems
17 *		Alan Cox	:	Connecting on a connecting socket
18 *					now returns an error for tcp.
19 *		Alan Cox	:	sock->protocol is set correctly.
20 *					and is not sometimes left as 0.
21 *		Alan Cox	:	connect handles icmp errors on a
22 *					connect properly. Unfortunately there
23 *					is a restart syscall nasty there. I
24 *					can't match BSD without hacking the C
25 *					library. Ideas urgently sought!
26 *		Alan Cox	:	Disallow bind() to addresses that are
27 *					not ours - especially broadcast ones!!
28 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30 *					instead they leave that for the DESTROY timer.
31 *		Alan Cox	:	Clean up error flag in accept
32 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33 *					was buggy. Put a remove_sock() in the handler
34 *					for memory when we hit 0. Also altered the timer
35 *					code. The ACK stuff can wait and needs major
36 *					TCP layer surgery.
37 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38 *					and fixed timer/inet_bh race.
39 *		Alan Cox	:	Added zapped flag for TCP
40 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47 *	Pauline Middelink	:	identd support
48 *		Alan Cox	:	Fixed connect() taking signals I think.
49 *		Alan Cox	:	SO_LINGER supported
50 *		Alan Cox	:	Error reporting fixes
51 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52 *		Alan Cox	:	inet sockets don't set sk->type!
53 *		Alan Cox	:	Split socket option code
54 *		Alan Cox	:	Callbacks
55 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56 *		Alex		:	Removed restriction on inet fioctl
57 *		Alan Cox	:	Splitting INET from NET core
58 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60 *		Alan Cox	:	Split IP from generic code
61 *		Alan Cox	:	New kfree_skbmem()
62 *		Alan Cox	:	Make SO_DEBUG superuser only.
63 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64 *					(compatibility fix)
65 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66 *		Alan Cox	:	Allocator for a socket is settable.
67 *		Alan Cox	:	SO_ERROR includes soft errors.
68 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69 *		Alan Cox	: 	Generic socket allocation to make hooks
70 *					easier (suggested by Craig Metz).
71 *		Michael Pall	:	SO_ERROR returns positive errno again
72 *              Steve Whitehouse:       Added default destructor to free
73 *                                      protocol private data.
74 *              Steve Whitehouse:       Added various other default routines
75 *                                      common to several socket families.
76 *              Chris Evans     :       Call suser() check last on F_SETOWN
77 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79 *		Andi Kleen	:	Fix write_space callback
80 *		Chris Evans	:	Security fixes - signedness again
81 *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82 *
83 * To Fix:
84 *
85 *
86 *		This program is free software; you can redistribute it and/or
87 *		modify it under the terms of the GNU General Public License
88 *		as published by the Free Software Foundation; either version
89 *		2 of the License, or (at your option) any later version.
90 */
91
92#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
93
94#include <linux/capability.h>
95#include <linux/errno.h>
96#include <linux/errqueue.h>
97#include <linux/types.h>
98#include <linux/socket.h>
99#include <linux/in.h>
100#include <linux/kernel.h>
101#include <linux/module.h>
102#include <linux/proc_fs.h>
103#include <linux/seq_file.h>
104#include <linux/sched.h>
105#include <linux/timer.h>
106#include <linux/string.h>
107#include <linux/sockios.h>
108#include <linux/net.h>
109#include <linux/mm.h>
110#include <linux/slab.h>
111#include <linux/interrupt.h>
112#include <linux/poll.h>
113#include <linux/tcp.h>
114#include <linux/init.h>
115#include <linux/highmem.h>
116#include <linux/user_namespace.h>
117#include <linux/static_key.h>
118#include <linux/memcontrol.h>
119#include <linux/prefetch.h>
120
121#include <asm/uaccess.h>
122
123#include <linux/netdevice.h>
124#include <net/protocol.h>
125#include <linux/skbuff.h>
126#include <net/net_namespace.h>
127#include <net/request_sock.h>
128#include <net/sock.h>
129#include <linux/net_tstamp.h>
130#include <net/xfrm.h>
131#include <linux/ipsec.h>
132#include <net/cls_cgroup.h>
133#include <net/netprio_cgroup.h>
134
135#include <linux/filter.h>
136
137#include <trace/events/sock.h>
138
139#ifdef CONFIG_INET
140#include <net/tcp.h>
141#endif
142
143#include <net/busy_poll.h>
144
145static DEFINE_MUTEX(proto_list_mutex);
146static LIST_HEAD(proto_list);
147
148/**
149 * sk_ns_capable - General socket capability test
150 * @sk: Socket to use a capability on or through
151 * @user_ns: The user namespace of the capability to use
152 * @cap: The capability to use
153 *
154 * Test to see if the opener of the socket had the capability @cap in the
155 * user namespace @user_ns when the socket was created, and that the
156 * current process has it as well.
157 */
158bool sk_ns_capable(const struct sock *sk,
159		   struct user_namespace *user_ns, int cap)
160{
161	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
162		ns_capable(user_ns, cap);
163}
164EXPORT_SYMBOL(sk_ns_capable);
165
166/**
167 * sk_capable - Socket global capability test
168 * @sk: Socket to use a capability on or through
169 * @cap: The global capability to use
170 *
171 * Test to see if the opener of the socket had the capability @cap in all
172 * user namespaces when the socket was created, and that the current
173 * process has it as well.
174 */
175bool sk_capable(const struct sock *sk, int cap)
176{
177	return sk_ns_capable(sk, &init_user_ns, cap);
178}
179EXPORT_SYMBOL(sk_capable);
180
181/**
182 * sk_net_capable - Network namespace socket capability test
183 * @sk: Socket to use a capability on or through
184 * @cap: The capability to use
185 *
186 * Test to see if the opener of the socket had the capability @cap over the
187 * network namespace the socket is a member of when the socket was created,
188 * and that the current process has it as well.
189 */
190bool sk_net_capable(const struct sock *sk, int cap)
191{
192	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
193}
194EXPORT_SYMBOL(sk_net_capable);
195
196
197#ifdef CONFIG_MEMCG_KMEM
198int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
199{
200	struct proto *proto;
201	int ret = 0;
202
203	mutex_lock(&proto_list_mutex);
204	list_for_each_entry(proto, &proto_list, node) {
205		if (proto->init_cgroup) {
206			ret = proto->init_cgroup(memcg, ss);
207			if (ret)
208				goto out;
209		}
210	}
211
212	mutex_unlock(&proto_list_mutex);
213	return ret;
214out:
215	list_for_each_entry_continue_reverse(proto, &proto_list, node)
216		if (proto->destroy_cgroup)
217			proto->destroy_cgroup(memcg);
218	mutex_unlock(&proto_list_mutex);
219	return ret;
220}
221
222void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg)
223{
224	struct proto *proto;
225
226	mutex_lock(&proto_list_mutex);
227	list_for_each_entry_reverse(proto, &proto_list, node)
228		if (proto->destroy_cgroup)
229			proto->destroy_cgroup(memcg);
230	mutex_unlock(&proto_list_mutex);
231}
232#endif
233
234/*
235 * Each address family might have different locking rules, so we have
236 * one slock key per address family:
237 */
238static struct lock_class_key af_family_keys[AF_MAX];
239static struct lock_class_key af_family_slock_keys[AF_MAX];
240
241#if defined(CONFIG_MEMCG_KMEM)
242struct static_key memcg_socket_limit_enabled;
243EXPORT_SYMBOL(memcg_socket_limit_enabled);
244#endif
245
246/*
247 * Make lock validator output more readable. (we pre-construct these
248 * strings build-time, so that runtime initialization of socket
249 * locks is fast):
250 */
251static const char *const af_family_key_strings[AF_MAX+1] = {
252  "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
253  "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
254  "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
255  "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
256  "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
257  "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
258  "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
259  "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
260  "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
261  "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
262  "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV"        ,
263  "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
264  "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
265  "sk_lock-AF_NFC"   , "sk_lock-AF_VSOCK"    , "sk_lock-AF_MAX"
266};
267static const char *const af_family_slock_key_strings[AF_MAX+1] = {
268  "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
269  "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
270  "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
271  "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
272  "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
273  "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
274  "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
275  "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
276  "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
277  "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
278  "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
279  "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
280  "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
281  "slock-AF_NFC"   , "slock-AF_VSOCK"    ,"slock-AF_MAX"
282};
283static const char *const af_family_clock_key_strings[AF_MAX+1] = {
284  "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
285  "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
286  "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
287  "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
288  "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
289  "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
290  "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
291  "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
292  "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
293  "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
294  "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
295  "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
296  "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
297  "clock-AF_NFC"   , "clock-AF_VSOCK"    , "clock-AF_MAX"
298};
299
300/*
301 * sk_callback_lock locking rules are per-address-family,
302 * so split the lock classes by using a per-AF key:
303 */
304static struct lock_class_key af_callback_keys[AF_MAX];
305
306/* Take into consideration the size of the struct sk_buff overhead in the
307 * determination of these values, since that is non-constant across
308 * platforms.  This makes socket queueing behavior and performance
309 * not depend upon such differences.
310 */
311#define _SK_MEM_PACKETS		256
312#define _SK_MEM_OVERHEAD	SKB_TRUESIZE(256)
313#define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
314#define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
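/* Rough sizing sketch (illustrative; the exact numbers depend on the
 * architecture and config): assuming SKB_TRUESIZE(256) is 256 bytes of
 * payload plus the aligned sizes of struct sk_buff and struct
 * skb_shared_info, it comes to roughly 800+ bytes on a typical 64-bit
 * build, so the SK_WMEM_MAX/SK_RMEM_MAX defaults above work out to
 * about 256 * ~832 ~= 208 KiB per socket.
 */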
315
316/* Run time adjustable parameters. */
317__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
318EXPORT_SYMBOL(sysctl_wmem_max);
319__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
320EXPORT_SYMBOL(sysctl_rmem_max);
321__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
322__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
323
324/* Maximal space eaten by iovec or ancillary data plus some space */
325int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
326EXPORT_SYMBOL(sysctl_optmem_max);
327
328struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
329EXPORT_SYMBOL_GPL(memalloc_socks);
330
331/**
332 * sk_set_memalloc - sets %SOCK_MEMALLOC
333 * @sk: socket to set it on
334 *
335 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
336 * It's the responsibility of the admin to adjust min_free_kbytes
337 * to meet the requirements
338 */
339void sk_set_memalloc(struct sock *sk)
340{
341	sock_set_flag(sk, SOCK_MEMALLOC);
342	sk->sk_allocation |= __GFP_MEMALLOC;
343	static_key_slow_inc(&memalloc_socks);
344}
345EXPORT_SYMBOL_GPL(sk_set_memalloc);
346
347void sk_clear_memalloc(struct sock *sk)
348{
349	sock_reset_flag(sk, SOCK_MEMALLOC);
350	sk->sk_allocation &= ~__GFP_MEMALLOC;
351	static_key_slow_dec(&memalloc_socks);
352
353	/*
354	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
355	 * progress of swapping. However, if SOCK_MEMALLOC is cleared while
356	 * it has rmem allocations there is a risk that the user of the
357	 * socket cannot make forward progress due to exceeding the rmem
358	 * limits. By rights, sk_clear_memalloc() should only be called
359	 * on sockets being torn down but warn and reset the accounting if
360	 * that assumption breaks.
361	 */
362	if (WARN_ON(sk->sk_forward_alloc))
363		sk_mem_reclaim(sk);
364}
365EXPORT_SYMBOL_GPL(sk_clear_memalloc);
366
367int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
368{
369	int ret;
370	unsigned long pflags = current->flags;
371
372	/* these should have been dropped before queueing */
373	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
374
375	current->flags |= PF_MEMALLOC;
376	ret = sk->sk_backlog_rcv(sk, skb);
377	tsk_restore_flags(current, pflags, PF_MEMALLOC);
378
379	return ret;
380}
381EXPORT_SYMBOL(__sk_backlog_rcv);
382
383static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
384{
385	struct timeval tv;
386
387	if (optlen < sizeof(tv))
388		return -EINVAL;
389	if (copy_from_user(&tv, optval, sizeof(tv)))
390		return -EFAULT;
391	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
392		return -EDOM;
393
394	if (tv.tv_sec < 0) {
395		static int warned __read_mostly;
396
397		*timeo_p = 0;
398		if (warned < 10 && net_ratelimit()) {
399			warned++;
400			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
401				__func__, current->comm, task_pid_nr(current));
402		}
403		return 0;
404	}
405	*timeo_p = MAX_SCHEDULE_TIMEOUT;
406	if (tv.tv_sec == 0 && tv.tv_usec == 0)
407		return 0;
408	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
409		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
410	return 0;
411}
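/* Illustrative example of the conversion above (assuming HZ == 1000):
 * a timeout of { .tv_sec = 1, .tv_usec = 500000 } becomes
 *
 *	1 * HZ + (500000 + (1000000/HZ - 1)) / (1000000/HZ) = 1500 jiffies
 *
 * i.e. the microsecond part is rounded up to the next tick, while a
 * timeout of { 0, 0 } maps to MAX_SCHEDULE_TIMEOUT (wait forever).
 */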
412
413static void sock_warn_obsolete_bsdism(const char *name)
414{
415	static int warned;
416	static char warncomm[TASK_COMM_LEN];
417	if (strcmp(warncomm, current->comm) && warned < 5) {
418		strcpy(warncomm,  current->comm);
419		pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
420			warncomm, name);
421		warned++;
422	}
423}
424
425#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
426
427static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
428{
429	if (sk->sk_flags & flags) {
430		sk->sk_flags &= ~flags;
431		if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP))
432			net_disable_timestamp();
433	}
434}
435
436
437int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
438{
439	int err;
440	unsigned long flags;
441	struct sk_buff_head *list = &sk->sk_receive_queue;
442
443	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
444		atomic_inc(&sk->sk_drops);
445		trace_sock_rcvqueue_full(sk, skb);
446		return -ENOMEM;
447	}
448
449	err = sk_filter(sk, skb);
450	if (err)
451		return err;
452
453	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
454		atomic_inc(&sk->sk_drops);
455		return -ENOBUFS;
456	}
457
458	skb->dev = NULL;
459	skb_set_owner_r(skb, sk);
460
461	/* we escape from the RCU protected region; make sure we don't leak
462	 * a noref dst
463	 */
464	skb_dst_force(skb);
465
466	spin_lock_irqsave(&list->lock, flags);
467	skb->dropcount = atomic_read(&sk->sk_drops);
468	__skb_queue_tail(list, skb);
469	spin_unlock_irqrestore(&list->lock, flags);
470
471	if (!sock_flag(sk, SOCK_DEAD))
472		sk->sk_data_ready(sk);
473	return 0;
474}
475EXPORT_SYMBOL(sock_queue_rcv_skb);
476
477int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
478{
479	int rc = NET_RX_SUCCESS;
480
481	if (sk_filter(sk, skb))
482		goto discard_and_relse;
483
484	skb->dev = NULL;
485
486	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
487		atomic_inc(&sk->sk_drops);
488		goto discard_and_relse;
489	}
490	if (nested)
491		bh_lock_sock_nested(sk);
492	else
493		bh_lock_sock(sk);
494	if (!sock_owned_by_user(sk)) {
495		/*
496		 * trylock + unlock semantics:
497		 */
498		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
499
500		rc = sk_backlog_rcv(sk, skb);
501
502		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
503	} else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
504		bh_unlock_sock(sk);
505		atomic_inc(&sk->sk_drops);
506		goto discard_and_relse;
507	}
508
509	bh_unlock_sock(sk);
510out:
511	sock_put(sk);
512	return rc;
513discard_and_relse:
514	kfree_skb(skb);
515	goto out;
516}
517EXPORT_SYMBOL(sk_receive_skb);
518
519struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
520{
521	struct dst_entry *dst = __sk_dst_get(sk);
522
523	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
524		sk_tx_queue_clear(sk);
525		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
526		dst_release(dst);
527		return NULL;
528	}
529
530	return dst;
531}
532EXPORT_SYMBOL(__sk_dst_check);
533
534struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
535{
536	struct dst_entry *dst = sk_dst_get(sk);
537
538	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
539		sk_dst_reset(sk);
540		dst_release(dst);
541		return NULL;
542	}
543
544	return dst;
545}
546EXPORT_SYMBOL(sk_dst_check);
547
548static int sock_setbindtodevice(struct sock *sk, char __user *optval,
549				int optlen)
550{
551	int ret = -ENOPROTOOPT;
552#ifdef CONFIG_NETDEVICES
553	struct net *net = sock_net(sk);
554	char devname[IFNAMSIZ];
555	int index;
556
557	/* Sorry... */
558	ret = -EPERM;
559	if (!ns_capable(net->user_ns, CAP_NET_RAW))
560		goto out;
561
562	ret = -EINVAL;
563	if (optlen < 0)
564		goto out;
565
566	/* Bind this socket to a particular device like "eth0",
567	 * as specified in the passed interface name. If the
568	 * name is "" or the option length is zero the socket
569	 * is not bound.
570	 */
571	if (optlen > IFNAMSIZ - 1)
572		optlen = IFNAMSIZ - 1;
573	memset(devname, 0, sizeof(devname));
574
575	ret = -EFAULT;
576	if (copy_from_user(devname, optval, optlen))
577		goto out;
578
579	index = 0;
580	if (devname[0] != '\0') {
581		struct net_device *dev;
582
583		rcu_read_lock();
584		dev = dev_get_by_name_rcu(net, devname);
585		if (dev)
586			index = dev->ifindex;
587		rcu_read_unlock();
588		ret = -ENODEV;
589		if (!dev)
590			goto out;
591	}
592
593	lock_sock(sk);
594	sk->sk_bound_dev_if = index;
595	sk_dst_reset(sk);
596	release_sock(sk);
597
598	ret = 0;
599
600out:
601#endif
602
603	return ret;
604}
605
606static int sock_getbindtodevice(struct sock *sk, char __user *optval,
607				int __user *optlen, int len)
608{
609	int ret = -ENOPROTOOPT;
610#ifdef CONFIG_NETDEVICES
611	struct net *net = sock_net(sk);
612	char devname[IFNAMSIZ];
613
614	if (sk->sk_bound_dev_if == 0) {
615		len = 0;
616		goto zero;
617	}
618
619	ret = -EINVAL;
620	if (len < IFNAMSIZ)
621		goto out;
622
623	ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
624	if (ret)
625		goto out;
626
627	len = strlen(devname) + 1;
628
629	ret = -EFAULT;
630	if (copy_to_user(optval, devname, len))
631		goto out;
632
633zero:
634	ret = -EFAULT;
635	if (put_user(len, optlen))
636		goto out;
637
638	ret = 0;
639
640out:
641#endif
642
643	return ret;
644}
645
646static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
647{
648	if (valbool)
649		sock_set_flag(sk, bit);
650	else
651		sock_reset_flag(sk, bit);
652}
653
654/*
655 *	This is meant for all protocols to use and covers goings on
656 *	at the socket level. Everything here is generic.
657 */
658
659int sock_setsockopt(struct socket *sock, int level, int optname,
660		    char __user *optval, unsigned int optlen)
661{
662	struct sock *sk = sock->sk;
663	int val;
664	int valbool;
665	struct linger ling;
666	int ret = 0;
667
668	/*
669	 *	Options without arguments
670	 */
671
672	if (optname == SO_BINDTODEVICE)
673		return sock_setbindtodevice(sk, optval, optlen);
674
675	if (optlen < sizeof(int))
676		return -EINVAL;
677
678	if (get_user(val, (int __user *)optval))
679		return -EFAULT;
680
681	valbool = val ? 1 : 0;
682
683	lock_sock(sk);
684
685	switch (optname) {
686	case SO_DEBUG:
687		if (val && !capable(CAP_NET_ADMIN))
688			ret = -EACCES;
689		else
690			sock_valbool_flag(sk, SOCK_DBG, valbool);
691		break;
692	case SO_REUSEADDR:
693		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
694		break;
695	case SO_REUSEPORT:
696		sk->sk_reuseport = valbool;
697		break;
698	case SO_TYPE:
699	case SO_PROTOCOL:
700	case SO_DOMAIN:
701	case SO_ERROR:
702		ret = -ENOPROTOOPT;
703		break;
704	case SO_DONTROUTE:
705		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
706		break;
707	case SO_BROADCAST:
708		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
709		break;
710	case SO_SNDBUF:
711		/* Don't error on this; BSD doesn't, and if you think
712		 * about it, this is right. Otherwise apps have to
713		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
714		 * are treated in BSD as hints.
715		 */
716		val = min_t(u32, val, sysctl_wmem_max);
717set_sndbuf:
718		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
719		sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF);
720		/* Wake up sending tasks if we upped the value. */
721		sk->sk_write_space(sk);
722		break;
723
724	case SO_SNDBUFFORCE:
725		if (!capable(CAP_NET_ADMIN)) {
726			ret = -EPERM;
727			break;
728		}
729		goto set_sndbuf;
730
731	case SO_RCVBUF:
732		/* Don't error on this; BSD doesn't, and if you think
733		 * about it, this is right. Otherwise apps have to
734		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
735		 * are treated in BSD as hints.
736		 */
737		val = min_t(u32, val, sysctl_rmem_max);
738set_rcvbuf:
739		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
740		/*
741		 * We double it on the way in to account for
742		 * "struct sk_buff" etc. overhead.   Applications
743		 * assume that the SO_RCVBUF setting they make will
744		 * allow that much actual data to be received on that
745		 * socket.
746		 *
747		 * Applications are unaware that "struct sk_buff" and
748		 * other overheads allocate from the receive buffer
749		 * during socket buffer allocation.
750		 *
751		 * And after considering the possible alternatives,
752		 * returning the value we actually used in getsockopt
753		 * is the most desirable behavior.
754		 */
755		sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF);
756		break;
757
758	case SO_RCVBUFFORCE:
759		if (!capable(CAP_NET_ADMIN)) {
760			ret = -EPERM;
761			break;
762		}
763		goto set_rcvbuf;
764
765	case SO_KEEPALIVE:
766#ifdef CONFIG_INET
767		if (sk->sk_protocol == IPPROTO_TCP &&
768		    sk->sk_type == SOCK_STREAM)
769			tcp_set_keepalive(sk, valbool);
770#endif
771		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
772		break;
773
774	case SO_OOBINLINE:
775		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
776		break;
777
778	case SO_NO_CHECK:
779		sk->sk_no_check_tx = valbool;
780		break;
781
782	case SO_PRIORITY:
783		if ((val >= 0 && val <= 6) ||
784		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
785			sk->sk_priority = val;
786		else
787			ret = -EPERM;
788		break;
789
790	case SO_LINGER:
791		if (optlen < sizeof(ling)) {
792			ret = -EINVAL;	/* 1003.1g */
793			break;
794		}
795		if (copy_from_user(&ling, optval, sizeof(ling))) {
796			ret = -EFAULT;
797			break;
798		}
799		if (!ling.l_onoff)
800			sock_reset_flag(sk, SOCK_LINGER);
801		else {
802#if (BITS_PER_LONG == 32)
803			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
804				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
805			else
806#endif
807				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
808			sock_set_flag(sk, SOCK_LINGER);
809		}
810		break;
811
812	case SO_BSDCOMPAT:
813		sock_warn_obsolete_bsdism("setsockopt");
814		break;
815
816	case SO_PASSCRED:
817		if (valbool)
818			set_bit(SOCK_PASSCRED, &sock->flags);
819		else
820			clear_bit(SOCK_PASSCRED, &sock->flags);
821		break;
822
823	case SO_TIMESTAMP:
824	case SO_TIMESTAMPNS:
825		if (valbool)  {
826			if (optname == SO_TIMESTAMP)
827				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
828			else
829				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
830			sock_set_flag(sk, SOCK_RCVTSTAMP);
831			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
832		} else {
833			sock_reset_flag(sk, SOCK_RCVTSTAMP);
834			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
835		}
836		break;
837
838	case SO_TIMESTAMPING:
839		if (val & ~SOF_TIMESTAMPING_MASK) {
840			ret = -EINVAL;
841			break;
842		}
843		if (val & SOF_TIMESTAMPING_OPT_ID &&
844		    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
845			if (sk->sk_protocol == IPPROTO_TCP) {
846				if (sk->sk_state != TCP_ESTABLISHED) {
847					ret = -EINVAL;
848					break;
849				}
850				sk->sk_tskey = tcp_sk(sk)->snd_una;
851			} else {
852				sk->sk_tskey = 0;
853			}
854		}
855		sk->sk_tsflags = val;
856		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
857			sock_enable_timestamp(sk,
858					      SOCK_TIMESTAMPING_RX_SOFTWARE);
859		else
860			sock_disable_timestamp(sk,
861					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
862		break;
863
864	case SO_RCVLOWAT:
865		if (val < 0)
866			val = INT_MAX;
867		sk->sk_rcvlowat = val ? : 1;
868		break;
869
870	case SO_RCVTIMEO:
871		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
872		break;
873
874	case SO_SNDTIMEO:
875		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
876		break;
877
878	case SO_ATTACH_FILTER:
879		ret = -EINVAL;
880		if (optlen == sizeof(struct sock_fprog)) {
881			struct sock_fprog fprog;
882
883			ret = -EFAULT;
884			if (copy_from_user(&fprog, optval, sizeof(fprog)))
885				break;
886
887			ret = sk_attach_filter(&fprog, sk);
888		}
889		break;
890
891	case SO_DETACH_FILTER:
892		ret = sk_detach_filter(sk);
893		break;
894
895	case SO_LOCK_FILTER:
896		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
897			ret = -EPERM;
898		else
899			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
900		break;
901
902	case SO_PASSSEC:
903		if (valbool)
904			set_bit(SOCK_PASSSEC, &sock->flags);
905		else
906			clear_bit(SOCK_PASSSEC, &sock->flags);
907		break;
908	case SO_MARK:
909		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
910			ret = -EPERM;
911		else
912			sk->sk_mark = val;
913		break;
914
915		/* We implement SO_SNDLOWAT etc. as
916		   not settable (1003.1g 5.3) */
917	case SO_RXQ_OVFL:
918		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
919		break;
920
921	case SO_WIFI_STATUS:
922		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
923		break;
924
925	case SO_PEEK_OFF:
926		if (sock->ops->set_peek_off)
927			ret = sock->ops->set_peek_off(sk, val);
928		else
929			ret = -EOPNOTSUPP;
930		break;
931
932	case SO_NOFCS:
933		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
934		break;
935
936	case SO_SELECT_ERR_QUEUE:
937		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
938		break;
939
940#ifdef CONFIG_NET_RX_BUSY_POLL
941	case SO_BUSY_POLL:
942		/* allow unprivileged users to decrease the value */
943		if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
944			ret = -EPERM;
945		else {
946			if (val < 0)
947				ret = -EINVAL;
948			else
949				sk->sk_ll_usec = val;
950		}
951		break;
952#endif
953
954	case SO_MAX_PACING_RATE:
955		sk->sk_max_pacing_rate = val;
956		sk->sk_pacing_rate = min(sk->sk_pacing_rate,
957					 sk->sk_max_pacing_rate);
958		break;
959
960	default:
961		ret = -ENOPROTOOPT;
962		break;
963	}
964	release_sock(sk);
965	return ret;
966}
967EXPORT_SYMBOL(sock_setsockopt);
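/* Illustrative userspace sketch (not part of this file): the doubling of
 * SO_RCVBUF/SO_SNDBUF performed above is visible to applications, e.g.
 *
 *	int req = 65536, eff;
 *	socklen_t len = sizeof(eff);
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &req, sizeof(req));
 *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &eff, &len);
 *
 * eff is now roughly 2 * req (clamped by sysctl_rmem_max and
 * SOCK_MIN_RCVBUF), since the kernel charges sk_buff overhead to the
 * receive buffer as well.
 */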
968
969
970static void cred_to_ucred(struct pid *pid, const struct cred *cred,
971			  struct ucred *ucred)
972{
973	ucred->pid = pid_vnr(pid);
974	ucred->uid = ucred->gid = -1;
975	if (cred) {
976		struct user_namespace *current_ns = current_user_ns();
977
978		ucred->uid = from_kuid_munged(current_ns, cred->euid);
979		ucred->gid = from_kgid_munged(current_ns, cred->egid);
980	}
981}
982
983int sock_getsockopt(struct socket *sock, int level, int optname,
984		    char __user *optval, int __user *optlen)
985{
986	struct sock *sk = sock->sk;
987
988	union {
989		int val;
990		struct linger ling;
991		struct timeval tm;
992	} v;
993
994	int lv = sizeof(int);
995	int len;
996
997	if (get_user(len, optlen))
998		return -EFAULT;
999	if (len < 0)
1000		return -EINVAL;
1001
1002	memset(&v, 0, sizeof(v));
1003
1004	switch (optname) {
1005	case SO_DEBUG:
1006		v.val = sock_flag(sk, SOCK_DBG);
1007		break;
1008
1009	case SO_DONTROUTE:
1010		v.val = sock_flag(sk, SOCK_LOCALROUTE);
1011		break;
1012
1013	case SO_BROADCAST:
1014		v.val = sock_flag(sk, SOCK_BROADCAST);
1015		break;
1016
1017	case SO_SNDBUF:
1018		v.val = sk->sk_sndbuf;
1019		break;
1020
1021	case SO_RCVBUF:
1022		v.val = sk->sk_rcvbuf;
1023		break;
1024
1025	case SO_REUSEADDR:
1026		v.val = sk->sk_reuse;
1027		break;
1028
1029	case SO_REUSEPORT:
1030		v.val = sk->sk_reuseport;
1031		break;
1032
1033	case SO_KEEPALIVE:
1034		v.val = sock_flag(sk, SOCK_KEEPOPEN);
1035		break;
1036
1037	case SO_TYPE:
1038		v.val = sk->sk_type;
1039		break;
1040
1041	case SO_PROTOCOL:
1042		v.val = sk->sk_protocol;
1043		break;
1044
1045	case SO_DOMAIN:
1046		v.val = sk->sk_family;
1047		break;
1048
1049	case SO_ERROR:
1050		v.val = -sock_error(sk);
1051		if (v.val == 0)
1052			v.val = xchg(&sk->sk_err_soft, 0);
1053		break;
1054
1055	case SO_OOBINLINE:
1056		v.val = sock_flag(sk, SOCK_URGINLINE);
1057		break;
1058
1059	case SO_NO_CHECK:
1060		v.val = sk->sk_no_check_tx;
1061		break;
1062
1063	case SO_PRIORITY:
1064		v.val = sk->sk_priority;
1065		break;
1066
1067	case SO_LINGER:
1068		lv		= sizeof(v.ling);
1069		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
1070		v.ling.l_linger	= sk->sk_lingertime / HZ;
1071		break;
1072
1073	case SO_BSDCOMPAT:
1074		sock_warn_obsolete_bsdism("getsockopt");
1075		break;
1076
1077	case SO_TIMESTAMP:
1078		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1079				!sock_flag(sk, SOCK_RCVTSTAMPNS);
1080		break;
1081
1082	case SO_TIMESTAMPNS:
1083		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1084		break;
1085
1086	case SO_TIMESTAMPING:
1087		v.val = sk->sk_tsflags;
1088		break;
1089
1090	case SO_RCVTIMEO:
1091		lv = sizeof(struct timeval);
1092		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1093			v.tm.tv_sec = 0;
1094			v.tm.tv_usec = 0;
1095		} else {
1096			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1097			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
1098		}
1099		break;
1100
1101	case SO_SNDTIMEO:
1102		lv = sizeof(struct timeval);
1103		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1104			v.tm.tv_sec = 0;
1105			v.tm.tv_usec = 0;
1106		} else {
1107			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1108			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
1109		}
1110		break;
1111
1112	case SO_RCVLOWAT:
1113		v.val = sk->sk_rcvlowat;
1114		break;
1115
1116	case SO_SNDLOWAT:
1117		v.val = 1;
1118		break;
1119
1120	case SO_PASSCRED:
1121		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1122		break;
1123
1124	case SO_PEERCRED:
1125	{
1126		struct ucred peercred;
1127		if (len > sizeof(peercred))
1128			len = sizeof(peercred);
1129		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1130		if (copy_to_user(optval, &peercred, len))
1131			return -EFAULT;
1132		goto lenout;
1133	}
1134
1135	case SO_PEERNAME:
1136	{
1137		char address[128];
1138
1139		if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
1140			return -ENOTCONN;
1141		if (lv < len)
1142			return -EINVAL;
1143		if (copy_to_user(optval, address, len))
1144			return -EFAULT;
1145		goto lenout;
1146	}
1147
1148	/* Dubious BSD thing... Probably nobody even uses it, but
1149	 * the UNIX standard wants it for whatever reason... -DaveM
1150	 */
1151	case SO_ACCEPTCONN:
1152		v.val = sk->sk_state == TCP_LISTEN;
1153		break;
1154
1155	case SO_PASSSEC:
1156		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1157		break;
1158
1159	case SO_PEERSEC:
1160		return security_socket_getpeersec_stream(sock, optval, optlen, len);
1161
1162	case SO_MARK:
1163		v.val = sk->sk_mark;
1164		break;
1165
1166	case SO_RXQ_OVFL:
1167		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1168		break;
1169
1170	case SO_WIFI_STATUS:
1171		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1172		break;
1173
1174	case SO_PEEK_OFF:
1175		if (!sock->ops->set_peek_off)
1176			return -EOPNOTSUPP;
1177
1178		v.val = sk->sk_peek_off;
1179		break;
1180	case SO_NOFCS:
1181		v.val = sock_flag(sk, SOCK_NOFCS);
1182		break;
1183
1184	case SO_BINDTODEVICE:
1185		return sock_getbindtodevice(sk, optval, optlen, len);
1186
1187	case SO_GET_FILTER:
1188		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1189		if (len < 0)
1190			return len;
1191
1192		goto lenout;
1193
1194	case SO_LOCK_FILTER:
1195		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1196		break;
1197
1198	case SO_BPF_EXTENSIONS:
1199		v.val = bpf_tell_extensions();
1200		break;
1201
1202	case SO_SELECT_ERR_QUEUE:
1203		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1204		break;
1205
1206#ifdef CONFIG_NET_RX_BUSY_POLL
1207	case SO_BUSY_POLL:
1208		v.val = sk->sk_ll_usec;
1209		break;
1210#endif
1211
1212	case SO_MAX_PACING_RATE:
1213		v.val = sk->sk_max_pacing_rate;
1214		break;
1215
1216	default:
1217		return -ENOPROTOOPT;
1218	}
1219
1220	if (len > lv)
1221		len = lv;
1222	if (copy_to_user(optval, &v, len))
1223		return -EFAULT;
1224lenout:
1225	if (put_user(len, optlen))
1226		return -EFAULT;
1227	return 0;
1228}
1229
1230/*
1231 * Initialize an sk_lock.
1232 *
1233 * (We also register the sk_lock with the lock validator.)
1234 */
1235static inline void sock_lock_init(struct sock *sk)
1236{
1237	sock_lock_init_class_and_name(sk,
1238			af_family_slock_key_strings[sk->sk_family],
1239			af_family_slock_keys + sk->sk_family,
1240			af_family_key_strings[sk->sk_family],
1241			af_family_keys + sk->sk_family);
1242}
1243
1244/*
1245 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1246 * even temporarily, because of RCU lookups. sk_node should also be left as is.
1247 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1248 */
1249static void sock_copy(struct sock *nsk, const struct sock *osk)
1250{
1251#ifdef CONFIG_SECURITY_NETWORK
1252	void *sptr = nsk->sk_security;
1253#endif
1254	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1255
1256	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1257	       osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1258
1259#ifdef CONFIG_SECURITY_NETWORK
1260	nsk->sk_security = sptr;
1261	security_sk_clone(osk, nsk);
1262#endif
1263}
1264
1265void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
1266{
1267	unsigned long nulls1, nulls2;
1268
1269	nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
1270	nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
1271	if (nulls1 > nulls2)
1272		swap(nulls1, nulls2);
1273
1274	if (nulls1 != 0)
1275		memset((char *)sk, 0, nulls1);
1276	memset((char *)sk + nulls1 + sizeof(void *), 0,
1277	       nulls2 - nulls1 - sizeof(void *));
1278	memset((char *)sk + nulls2 + sizeof(void *), 0,
1279	       size - nulls2 - sizeof(void *));
1280}
1281EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
1282
1283static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1284		int family)
1285{
1286	struct sock *sk;
1287	struct kmem_cache *slab;
1288
1289	slab = prot->slab;
1290	if (slab != NULL) {
1291		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1292		if (!sk)
1293			return sk;
1294		if (priority & __GFP_ZERO) {
1295			if (prot->clear_sk)
1296				prot->clear_sk(sk, prot->obj_size);
1297			else
1298				sk_prot_clear_nulls(sk, prot->obj_size);
1299		}
1300	} else
1301		sk = kmalloc(prot->obj_size, priority);
1302
1303	if (sk != NULL) {
1304		kmemcheck_annotate_bitfield(sk, flags);
1305
1306		if (security_sk_alloc(sk, family, priority))
1307			goto out_free;
1308
1309		if (!try_module_get(prot->owner))
1310			goto out_free_sec;
1311		sk_tx_queue_clear(sk);
1312	}
1313
1314	return sk;
1315
1316out_free_sec:
1317	security_sk_free(sk);
1318out_free:
1319	if (slab != NULL)
1320		kmem_cache_free(slab, sk);
1321	else
1322		kfree(sk);
1323	return NULL;
1324}
1325
1326static void sk_prot_free(struct proto *prot, struct sock *sk)
1327{
1328	struct kmem_cache *slab;
1329	struct module *owner;
1330
1331	owner = prot->owner;
1332	slab = prot->slab;
1333
1334	security_sk_free(sk);
1335	if (slab != NULL)
1336		kmem_cache_free(slab, sk);
1337	else
1338		kfree(sk);
1339	module_put(owner);
1340}
1341
1342#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
1343void sock_update_netprioidx(struct sock *sk)
1344{
1345	if (in_interrupt())
1346		return;
1347
1348	sk->sk_cgrp_prioidx = task_netprioidx(current);
1349}
1350EXPORT_SYMBOL_GPL(sock_update_netprioidx);
1351#endif
1352
1353/**
1354 *	sk_alloc - All socket objects are allocated here
1355 *	@net: the applicable net namespace
1356 *	@family: protocol family
1357 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1358 *	@prot: struct proto associated with this new sock instance
1359 */
1360struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1361		      struct proto *prot)
1362{
1363	struct sock *sk;
1364
1365	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1366	if (sk) {
1367		sk->sk_family = family;
1368		/*
1369		 * See comment in struct sock definition to understand
1370		 * why we need sk_prot_creator -acme
1371		 */
1372		sk->sk_prot = sk->sk_prot_creator = prot;
1373		sock_lock_init(sk);
1374		sock_net_set(sk, get_net(net));
1375		atomic_set(&sk->sk_wmem_alloc, 1);
1376
1377		sock_update_classid(sk);
1378		sock_update_netprioidx(sk);
1379	}
1380
1381	return sk;
1382}
1383EXPORT_SYMBOL(sk_alloc);
1384
1385static void __sk_free(struct sock *sk)
1386{
1387	struct sk_filter *filter;
1388
1389	if (sk->sk_destruct)
1390		sk->sk_destruct(sk);
1391
1392	filter = rcu_dereference_check(sk->sk_filter,
1393				       atomic_read(&sk->sk_wmem_alloc) == 0);
1394	if (filter) {
1395		sk_filter_uncharge(sk, filter);
1396		RCU_INIT_POINTER(sk->sk_filter, NULL);
1397	}
1398
1399	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1400
1401	if (atomic_read(&sk->sk_omem_alloc))
1402		pr_debug("%s: optmem leakage (%d bytes) detected\n",
1403			 __func__, atomic_read(&sk->sk_omem_alloc));
1404
1405	if (sk->sk_peer_cred)
1406		put_cred(sk->sk_peer_cred);
1407	put_pid(sk->sk_peer_pid);
1408	put_net(sock_net(sk));
1409	sk_prot_free(sk->sk_prot_creator, sk);
1410}
1411
1412void sk_free(struct sock *sk)
1413{
1414	/*
1415	 * We subtract one from sk_wmem_alloc so we can tell whether
1416	 * some packets are still in some tx queue.
1417	 * If the result is not zero, sock_wfree() will call __sk_free(sk) later.
1418	 */
1419	if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1420		__sk_free(sk);
1421}
1422EXPORT_SYMBOL(sk_free);
1423
1424/*
1425 * The last sock_put should drop the reference to sk->sk_net. It has already
1426 * been dropped in sk_change_net. Taking a reference to the stopping namespace
1427 * is not an option.
1428 * Take a reference to the socket to remove it from the hash while it is still
1429 * _alive_, and after that destroy it in the context of init_net.
1430 */
1431void sk_release_kernel(struct sock *sk)
1432{
1433	if (sk == NULL || sk->sk_socket == NULL)
1434		return;
1435
1436	sock_hold(sk);
1437	sock_release(sk->sk_socket);
1438	release_net(sock_net(sk));
1439	sock_net_set(sk, get_net(&init_net));
1440	sock_put(sk);
1441}
1442EXPORT_SYMBOL(sk_release_kernel);
1443
1444static void sk_update_clone(const struct sock *sk, struct sock *newsk)
1445{
1446	if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
1447		sock_update_memcg(newsk);
1448}
1449
1450/**
1451 *	sk_clone_lock - clone a socket, and lock its clone
1452 *	@sk: the socket to clone
1453 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1454 *
1455 *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1456 */
1457struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1458{
1459	struct sock *newsk;
1460	bool is_charged = true;
1461
1462	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1463	if (newsk != NULL) {
1464		struct sk_filter *filter;
1465
1466		sock_copy(newsk, sk);
1467
1468		/* SANITY */
1469		get_net(sock_net(newsk));
1470		sk_node_init(&newsk->sk_node);
1471		sock_lock_init(newsk);
1472		bh_lock_sock(newsk);
1473		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
1474		newsk->sk_backlog.len = 0;
1475
1476		atomic_set(&newsk->sk_rmem_alloc, 0);
1477		/*
1478		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1479		 */
1480		atomic_set(&newsk->sk_wmem_alloc, 1);
1481		atomic_set(&newsk->sk_omem_alloc, 0);
1482		skb_queue_head_init(&newsk->sk_receive_queue);
1483		skb_queue_head_init(&newsk->sk_write_queue);
1484#ifdef CONFIG_NET_DMA
1485		skb_queue_head_init(&newsk->sk_async_wait_queue);
1486#endif
1487
1488		spin_lock_init(&newsk->sk_dst_lock);
1489		rwlock_init(&newsk->sk_callback_lock);
1490		lockdep_set_class_and_name(&newsk->sk_callback_lock,
1491				af_callback_keys + newsk->sk_family,
1492				af_family_clock_key_strings[newsk->sk_family]);
1493
1494		newsk->sk_dst_cache	= NULL;
1495		newsk->sk_wmem_queued	= 0;
1496		newsk->sk_forward_alloc = 0;
1497		newsk->sk_send_head	= NULL;
1498		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1499
1500		sock_reset_flag(newsk, SOCK_DONE);
1501		skb_queue_head_init(&newsk->sk_error_queue);
1502
1503		filter = rcu_dereference_protected(newsk->sk_filter, 1);
1504		if (filter != NULL)
1505			/* though it's an empty new sock, the charging may fail
1506			 * if sysctl_optmem_max was changed between creation of
1507			 * original socket and cloning
1508			 */
1509			is_charged = sk_filter_charge(newsk, filter);
1510
1511		if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk))) {
1512			/* It is still a raw copy of the parent, so invalidate
1513			 * its destructor and do a plain sk_free() */
1514			newsk->sk_destruct = NULL;
1515			bh_unlock_sock(newsk);
1516			sk_free(newsk);
1517			newsk = NULL;
1518			goto out;
1519		}
1520
1521		newsk->sk_err	   = 0;
1522		newsk->sk_priority = 0;
1523		/*
1524		 * Before updating sk_refcnt, we must commit prior changes to memory
1525		 * (Documentation/RCU/rculist_nulls.txt for details)
1526		 */
1527		smp_wmb();
1528		atomic_set(&newsk->sk_refcnt, 2);
1529
1530		/*
1531		 * Increment the counter in the same struct proto as the master
1532		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1533		 * is the same as sk->sk_prot->socks, as this field was copied
1534		 * with memcpy).
1535		 *
1536		 * This _changes_ the previous behaviour, where
1537		 * tcp_create_openreq_child always was incrementing the
1538		 * equivalent to tcp_prot->socks (inet_sock_nr), so this has
1539		 * to be taken into account in all callers. -acme
1540		 */
1541		sk_refcnt_debug_inc(newsk);
1542		sk_set_socket(newsk, NULL);
1543		newsk->sk_wq = NULL;
1544
1545		sk_update_clone(sk, newsk);
1546
1547		if (newsk->sk_prot->sockets_allocated)
1548			sk_sockets_allocated_inc(newsk);
1549
1550		if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1551			net_enable_timestamp();
1552	}
1553out:
1554	return newsk;
1555}
1556EXPORT_SYMBOL_GPL(sk_clone_lock);
1557
1558void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1559{
1560	__sk_dst_set(sk, dst);
1561	sk->sk_route_caps = dst->dev->features;
1562	if (sk->sk_route_caps & NETIF_F_GSO)
1563		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1564	sk->sk_route_caps &= ~sk->sk_route_nocaps;
1565	if (sk_can_gso(sk)) {
1566		if (dst->header_len) {
1567			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1568		} else {
1569			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1570			sk->sk_gso_max_size = dst->dev->gso_max_size;
1571			sk->sk_gso_max_segs = dst->dev->gso_max_segs;
1572		}
1573	}
1574}
1575EXPORT_SYMBOL_GPL(sk_setup_caps);
1576
1577/*
1578 *	Simple resource managers for sockets.
1579 */
1580
1581
1582/*
1583 * Write buffer destructor automatically called from kfree_skb.
1584 */
1585void sock_wfree(struct sk_buff *skb)
1586{
1587	struct sock *sk = skb->sk;
1588	unsigned int len = skb->truesize;
1589
1590	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1591		/*
1592		 * Keep a reference on sk_wmem_alloc; it will be released
1593		 * after the sk_write_space() call
1594		 */
1595		atomic_sub(len - 1, &sk->sk_wmem_alloc);
1596		sk->sk_write_space(sk);
1597		len = 1;
1598	}
1599	/*
1600	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1601	 * could not do because of in-flight packets
1602	 */
1603	if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1604		__sk_free(sk);
1605}
1606EXPORT_SYMBOL(sock_wfree);
1607
1608void skb_orphan_partial(struct sk_buff *skb)
1609{
1610	/* TCP stack sets skb->ooo_okay based on sk_wmem_alloc,
1611	 * so we do not completely orphan the skb, but transfer all
1612	 * but one of the accounted bytes, to avoid unexpected reorders.
1613	 */
1614	if (skb->destructor == sock_wfree
1615#ifdef CONFIG_INET
1616	    || skb->destructor == tcp_wfree
1617#endif
1618		) {
1619		atomic_sub(skb->truesize - 1, &skb->sk->sk_wmem_alloc);
1620		skb->truesize = 1;
1621	} else {
1622		skb_orphan(skb);
1623	}
1624}
1625EXPORT_SYMBOL(skb_orphan_partial);
1626
1627/*
1628 * Read buffer destructor automatically called from kfree_skb.
1629 */
1630void sock_rfree(struct sk_buff *skb)
1631{
1632	struct sock *sk = skb->sk;
1633	unsigned int len = skb->truesize;
1634
1635	atomic_sub(len, &sk->sk_rmem_alloc);
1636	sk_mem_uncharge(sk, len);
1637}
1638EXPORT_SYMBOL(sock_rfree);
1639
1640void sock_edemux(struct sk_buff *skb)
1641{
1642	struct sock *sk = skb->sk;
1643
1644#ifdef CONFIG_INET
1645	if (sk->sk_state == TCP_TIME_WAIT)
1646		inet_twsk_put(inet_twsk(sk));
1647	else
1648#endif
1649		sock_put(sk);
1650}
1651EXPORT_SYMBOL(sock_edemux);
1652
1653kuid_t sock_i_uid(struct sock *sk)
1654{
1655	kuid_t uid;
1656
1657	read_lock_bh(&sk->sk_callback_lock);
1658	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1659	read_unlock_bh(&sk->sk_callback_lock);
1660	return uid;
1661}
1662EXPORT_SYMBOL(sock_i_uid);
1663
1664unsigned long sock_i_ino(struct sock *sk)
1665{
1666	unsigned long ino;
1667
1668	read_lock_bh(&sk->sk_callback_lock);
1669	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1670	read_unlock_bh(&sk->sk_callback_lock);
1671	return ino;
1672}
1673EXPORT_SYMBOL(sock_i_ino);
1674
1675/*
1676 * Allocate a skb from the socket's send buffer.
1677 */
1678struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1679			     gfp_t priority)
1680{
1681	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1682		struct sk_buff *skb = alloc_skb(size, priority);
1683		if (skb) {
1684			skb_set_owner_w(skb, sk);
1685			return skb;
1686		}
1687	}
1688	return NULL;
1689}
1690EXPORT_SYMBOL(sock_wmalloc);
1691
1692/*
1693 * Allocate a memory block from the socket's option memory buffer.
1694 */
1695void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1696{
1697	if ((unsigned int)size <= sysctl_optmem_max &&
1698	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1699		void *mem;
1700		/* First do the add, to avoid the race if kmalloc
1701		 * might sleep.
1702		 */
1703		atomic_add(size, &sk->sk_omem_alloc);
1704		mem = kmalloc(size, priority);
1705		if (mem)
1706			return mem;
1707		atomic_sub(size, &sk->sk_omem_alloc);
1708	}
1709	return NULL;
1710}
1711EXPORT_SYMBOL(sock_kmalloc);
1712
1713/*
1714 * Free an option memory block.
1715 */
1716void sock_kfree_s(struct sock *sk, void *mem, int size)
1717{
1718	kfree(mem);
1719	atomic_sub(size, &sk->sk_omem_alloc);
1720}
1721EXPORT_SYMBOL(sock_kfree_s);
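/* Usage sketch (illustrative; "foo_opt" is a made-up name): sock_kmalloc()
 * and sock_kfree_s() must be paired with the same size, because that size
 * is what is charged to and uncharged from sk_omem_alloc:
 *
 *	struct foo_opt *opt = sock_kmalloc(sk, sizeof(*opt), GFP_KERNEL);
 *
 *	if (!opt)
 *		return -ENOBUFS;
 *	...
 *	sock_kfree_s(sk, opt, sizeof(*opt));
 */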
1722
1723/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1724   I think these locks should be removed for datagram sockets.
1725 */
1726static long sock_wait_for_wmem(struct sock *sk, long timeo)
1727{
1728	DEFINE_WAIT(wait);
1729
1730	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1731	for (;;) {
1732		if (!timeo)
1733			break;
1734		if (signal_pending(current))
1735			break;
1736		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1737		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1738		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1739			break;
1740		if (sk->sk_shutdown & SEND_SHUTDOWN)
1741			break;
1742		if (sk->sk_err)
1743			break;
1744		timeo = schedule_timeout(timeo);
1745	}
1746	finish_wait(sk_sleep(sk), &wait);
1747	return timeo;
1748}
1749
1750
1751/*
1752 *	Generic send/receive buffer handlers
1753 */
1754
1755struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1756				     unsigned long data_len, int noblock,
1757				     int *errcode, int max_page_order)
1758{
1759	struct sk_buff *skb = NULL;
1760	unsigned long chunk;
1761	gfp_t gfp_mask;
1762	long timeo;
1763	int err;
1764	int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1765	struct page *page;
1766	int i;
1767
1768	err = -EMSGSIZE;
1769	if (npages > MAX_SKB_FRAGS)
1770		goto failure;
1771
1772	timeo = sock_sndtimeo(sk, noblock);
1773	while (!skb) {
1774		err = sock_error(sk);
1775		if (err != 0)
1776			goto failure;
1777
1778		err = -EPIPE;
1779		if (sk->sk_shutdown & SEND_SHUTDOWN)
1780			goto failure;
1781
1782		if (atomic_read(&sk->sk_wmem_alloc) >= sk->sk_sndbuf) {
1783			set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1784			set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1785			err = -EAGAIN;
1786			if (!timeo)
1787				goto failure;
1788			if (signal_pending(current))
1789				goto interrupted;
1790			timeo = sock_wait_for_wmem(sk, timeo);
1791			continue;
1792		}
1793
1794		err = -ENOBUFS;
1795		gfp_mask = sk->sk_allocation;
1796		if (gfp_mask & __GFP_WAIT)
1797			gfp_mask |= __GFP_REPEAT;
1798
1799		skb = alloc_skb(header_len, gfp_mask);
1800		if (!skb)
1801			goto failure;
1802
1803		skb->truesize += data_len;
1804
1805		for (i = 0; npages > 0; i++) {
1806			int order = max_page_order;
1807
1808			while (order) {
1809				if (npages >= 1 << order) {
1810					page = alloc_pages(sk->sk_allocation |
1811							   __GFP_COMP |
1812							   __GFP_NOWARN |
1813							   __GFP_NORETRY,
1814							   order);
1815					if (page)
1816						goto fill_page;
1817				}
1818				order--;
1819			}
1820			page = alloc_page(sk->sk_allocation);
1821			if (!page)
1822				goto failure;
1823fill_page:
1824			chunk = min_t(unsigned long, data_len,
1825				      PAGE_SIZE << order);
1826			skb_fill_page_desc(skb, i, page, 0, chunk);
1827			data_len -= chunk;
1828			npages -= 1 << order;
1829		}
1830	}
1831
1832	skb_set_owner_w(skb, sk);
1833	return skb;
1834
1835interrupted:
1836	err = sock_intr_errno(timeo);
1837failure:
1838	kfree_skb(skb);
1839	*errcode = err;
1840	return NULL;
1841}
1842EXPORT_SYMBOL(sock_alloc_send_pskb);
1843
1844struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1845				    int noblock, int *errcode)
1846{
1847	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
1848}
1849EXPORT_SYMBOL(sock_alloc_send_skb);
1850
1851/* On 32bit arches, an skb frag is limited to 2^15 */
1852#define SKB_FRAG_PAGE_ORDER	get_order(32768)
1853
1854/**
1855 * skb_page_frag_refill - check that a page_frag contains enough room
1856 * @sz: minimum size of the fragment we want to get
1857 * @pfrag: pointer to page_frag
1858 * @prio: priority for memory allocation
1859 *
1860 * Note: While this allocator tries to use high order pages, there is
1861 * no guarantee that allocations succeed. Therefore, @sz MUST be
1862 * less than or equal to PAGE_SIZE.
1863 */
1864bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t prio)
1865{
1866	int order;
1867
1868	if (pfrag->page) {
1869		if (atomic_read(&pfrag->page->_count) == 1) {
1870			pfrag->offset = 0;
1871			return true;
1872		}
1873		if (pfrag->offset + sz <= pfrag->size)
1874			return true;
1875		put_page(pfrag->page);
1876	}
1877
1878	order = SKB_FRAG_PAGE_ORDER;
1879	do {
1880		gfp_t gfp = prio;
1881
1882		if (order)
1883			gfp |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY;
1884		pfrag->page = alloc_pages(gfp, order);
1885		if (likely(pfrag->page)) {
1886			pfrag->offset = 0;
1887			pfrag->size = PAGE_SIZE << order;
1888			return true;
1889		}
1890	} while (--order >= 0);
1891
1892	return false;
1893}
1894EXPORT_SYMBOL(skb_page_frag_refill);
1895
1896bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
1897{
1898	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
1899		return true;
1900
1901	sk_enter_memory_pressure(sk);
1902	sk_stream_moderate_sndbuf(sk);
1903	return false;
1904}
1905EXPORT_SYMBOL(sk_page_frag_refill);
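/* Simplified caller pattern (a sketch, not a real user; "left" stands for
 * whatever remains of the user buffer): refill the per-socket page_frag,
 * copy into it at the current offset, then advance the offset by the
 * amount consumed:
 *
 *	struct page_frag *pfrag = sk_page_frag(sk);
 *
 *	if (!sk_page_frag_refill(sk, pfrag))
 *		goto wait_for_memory;
 *	copy = min_t(int, left, pfrag->size - pfrag->offset);
 *	(copy 'copy' bytes into pfrag->page at pfrag->offset, attach the
 *	 page to the skb, then:)
 *	pfrag->offset += copy;
 */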
1906
1907static void __lock_sock(struct sock *sk)
1908	__releases(&sk->sk_lock.slock)
1909	__acquires(&sk->sk_lock.slock)
1910{
1911	DEFINE_WAIT(wait);
1912
1913	for (;;) {
1914		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1915					TASK_UNINTERRUPTIBLE);
1916		spin_unlock_bh(&sk->sk_lock.slock);
1917		schedule();
1918		spin_lock_bh(&sk->sk_lock.slock);
1919		if (!sock_owned_by_user(sk))
1920			break;
1921	}
1922	finish_wait(&sk->sk_lock.wq, &wait);
1923}
1924
1925static void __release_sock(struct sock *sk)
1926	__releases(&sk->sk_lock.slock)
1927	__acquires(&sk->sk_lock.slock)
1928{
1929	struct sk_buff *skb = sk->sk_backlog.head;
1930
1931	do {
1932		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1933		bh_unlock_sock(sk);
1934
1935		do {
1936			struct sk_buff *next = skb->next;
1937
1938			prefetch(next);
1939			WARN_ON_ONCE(skb_dst_is_noref(skb));
1940			skb->next = NULL;
1941			sk_backlog_rcv(sk, skb);
1942
1943			/*
1944			 * We are in process context here with softirqs
1945			 * disabled, use cond_resched_softirq() to preempt.
1946			 * This is safe to do because we've taken the backlog
1947			 * queue private:
1948			 */
1949			cond_resched_softirq();
1950
1951			skb = next;
1952		} while (skb != NULL);
1953
1954		bh_lock_sock(sk);
1955	} while ((skb = sk->sk_backlog.head) != NULL);
1956
1957	/*
1958	 * Doing the zeroing here guarantees we cannot loop forever
1959	 * while a wild producer attempts to flood us.
1960	 */
1961	sk->sk_backlog.len = 0;
1962}
1963
1964/**
1965 * sk_wait_data - wait for data to arrive at sk_receive_queue
1966 * @sk:    sock to wait on
1967 * @timeo: for how long
1968 *
1969 * Now socket state including sk->sk_err is changed only under the lock,
1970 * hence we may omit checks after joining the wait queue.
1971 * We check the receive queue before schedule() only as an optimization;
1972 * it is very likely that release_sock() added new data.
1973 */
1974int sk_wait_data(struct sock *sk, long *timeo)
1975{
1976	int rc;
1977	DEFINE_WAIT(wait);
1978
1979	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1980	set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1981	rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1982	clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1983	finish_wait(sk_sleep(sk), &wait);
1984	return rc;
1985}
1986EXPORT_SYMBOL(sk_wait_data);
1987
1988/**
1989 *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
1990 *	@sk: socket
1991 *	@size: memory size to allocate
1992 *	@kind: allocation type
1993 *
1994 *	If @kind is SK_MEM_SEND, this is a wmem allocation; otherwise it is an
1995 *	rmem allocation. This function assumes that protocols which have
1996 *	memory_pressure use sk_wmem_queued for write buffer accounting.
1997 */
1998int __sk_mem_schedule(struct sock *sk, int size, int kind)
1999{
2000	struct proto *prot = sk->sk_prot;
2001	int amt = sk_mem_pages(size);
2002	long allocated;
2003	int parent_status = UNDER_LIMIT;
2004
2005	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
2006
2007	allocated = sk_memory_allocated_add(sk, amt, &parent_status);
2008
2009	/* Under limit. */
2010	if (parent_status == UNDER_LIMIT &&
2011			allocated <= sk_prot_mem_limits(sk, 0)) {
2012		sk_leave_memory_pressure(sk);
2013		return 1;
2014	}
2015
2016	/* Under pressure (we or our parents) */
2017	if ((parent_status > SOFT_LIMIT) ||
2018			allocated > sk_prot_mem_limits(sk, 1))
2019		sk_enter_memory_pressure(sk);
2020
2021	/* Over hard limit (we or our parents) */
2022	if ((parent_status == OVER_LIMIT) ||
2023			(allocated > sk_prot_mem_limits(sk, 2)))
2024		goto suppress_allocation;
2025
2026	/* guarantee minimum buffer size under pressure */
2027	if (kind == SK_MEM_RECV) {
2028		if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
2029			return 1;
2030
2031	} else { /* SK_MEM_SEND */
2032		if (sk->sk_type == SOCK_STREAM) {
2033			if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
2034				return 1;
2035		} else if (atomic_read(&sk->sk_wmem_alloc) <
2036			   prot->sysctl_wmem[0])
2037				return 1;
2038	}
2039
2040	if (sk_has_memory_pressure(sk)) {
2041		int alloc;
2042
2043		if (!sk_under_memory_pressure(sk))
2044			return 1;
2045		alloc = sk_sockets_allocated_read_positive(sk);
2046		if (sk_prot_mem_limits(sk, 2) > alloc *
2047		    sk_mem_pages(sk->sk_wmem_queued +
2048				 atomic_read(&sk->sk_rmem_alloc) +
2049				 sk->sk_forward_alloc))
2050			return 1;
2051	}
2052
2053suppress_allocation:
2054
2055	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2056		sk_stream_moderate_sndbuf(sk);
2057
2058		/* Fail only if socket is _under_ its sndbuf.
2059		 * In this case we cannot block, so we have to fail.
2060		 */
2061		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2062			return 1;
2063	}
2064
2065	trace_sock_exceed_buf_limit(sk, prot, allocated);
2066
2067	/* Alas. Undo changes. */
2068	sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
2069
2070	sk_memory_allocated_sub(sk, amt);
2071
2072	return 0;
2073}
2074EXPORT_SYMBOL(__sk_mem_schedule);
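/*
 * Callers normally go through the sk_wmem_schedule()/sk_rmem_schedule()
 * wrappers in include/net/sock.h, which only fall back to this slow path
 * when sk_forward_alloc cannot already cover the request, roughly:
 *
 *	if (size <= sk->sk_forward_alloc)
 *		return 1;
 *	return __sk_mem_schedule(sk, size, SK_MEM_SEND);
 */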
2075
2076/**
2077 *	__sk_mem_reclaim - reclaim memory_allocated
2078 *	@sk: socket
2079 */
2080void __sk_mem_reclaim(struct sock *sk)
2081{
2082	sk_memory_allocated_sub(sk,
2083				sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT);
2084	sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
2085
2086	if (sk_under_memory_pressure(sk) &&
2087	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2088		sk_leave_memory_pressure(sk);
2089}
2090EXPORT_SYMBOL(__sk_mem_reclaim);
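/*
 * The usual entry point is the sk_mem_reclaim() wrapper in
 * include/net/sock.h, which skips the call while less than one
 * SK_MEM_QUANTUM of forward allocation is outstanding, roughly:
 *
 *	if (sk->sk_forward_alloc >= SK_MEM_QUANTUM)
 *		__sk_mem_reclaim(sk);
 */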
2091
2092
2093/*
2094 * Set of default routines for initialising struct proto_ops when
2095 * the protocol does not support a particular function. In certain
2096 * cases where it makes no sense for a protocol to have a "do nothing"
2097 * function, some default processing is provided.
2098 */
2099
2100int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2101{
2102	return -EOPNOTSUPP;
2103}
2104EXPORT_SYMBOL(sock_no_bind);
2105
2106int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2107		    int len, int flags)
2108{
2109	return -EOPNOTSUPP;
2110}
2111EXPORT_SYMBOL(sock_no_connect);
2112
2113int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2114{
2115	return -EOPNOTSUPP;
2116}
2117EXPORT_SYMBOL(sock_no_socketpair);
2118
2119int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
2120{
2121	return -EOPNOTSUPP;
2122}
2123EXPORT_SYMBOL(sock_no_accept);
2124
2125int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2126		    int *len, int peer)
2127{
2128	return -EOPNOTSUPP;
2129}
2130EXPORT_SYMBOL(sock_no_getname);
2131
2132unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
2133{
2134	return 0;
2135}
2136EXPORT_SYMBOL(sock_no_poll);
2137
2138int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2139{
2140	return -EOPNOTSUPP;
2141}
2142EXPORT_SYMBOL(sock_no_ioctl);
2143
2144int sock_no_listen(struct socket *sock, int backlog)
2145{
2146	return -EOPNOTSUPP;
2147}
2148EXPORT_SYMBOL(sock_no_listen);
2149
2150int sock_no_shutdown(struct socket *sock, int how)
2151{
2152	return -EOPNOTSUPP;
2153}
2154EXPORT_SYMBOL(sock_no_shutdown);
2155
2156int sock_no_setsockopt(struct socket *sock, int level, int optname,
2157		    char __user *optval, unsigned int optlen)
2158{
2159	return -EOPNOTSUPP;
2160}
2161EXPORT_SYMBOL(sock_no_setsockopt);
2162
2163int sock_no_getsockopt(struct socket *sock, int level, int optname,
2164		    char __user *optval, int __user *optlen)
2165{
2166	return -EOPNOTSUPP;
2167}
2168EXPORT_SYMBOL(sock_no_getsockopt);
2169
2170int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
2171		    size_t len)
2172{
2173	return -EOPNOTSUPP;
2174}
2175EXPORT_SYMBOL(sock_no_sendmsg);
2176
2177int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
2178		    size_t len, int flags)
2179{
2180	return -EOPNOTSUPP;
2181}
2182EXPORT_SYMBOL(sock_no_recvmsg);
2183
2184int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2185{
2186	/* Mirror missing mmap method error code */
2187	return -ENODEV;
2188}
2189EXPORT_SYMBOL(sock_no_mmap);
2190
2191ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2192{
2193	ssize_t res;
2194	struct msghdr msg = {.msg_flags = flags};
2195	struct kvec iov;
2196	char *kaddr = kmap(page);
2197	iov.iov_base = kaddr + offset;
2198	iov.iov_len = size;
2199	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2200	kunmap(page);
2201	return res;
2202}
2203EXPORT_SYMBOL(sock_no_sendpage);
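/*
 * Example (sketch, hypothetical family and ops names): a protocol that does
 * not implement an operation simply plugs the matching stub into its
 * proto_ops, e.g.:
 *
 *	static const struct proto_ops example_ops = {
 *		.family		= PF_EXAMPLE,
 *		.owner		= THIS_MODULE,
 *		.listen		= sock_no_listen,
 *		.socketpair	= sock_no_socketpair,
 *		.mmap		= sock_no_mmap,
 *		.sendpage	= sock_no_sendpage,
 *		...
 *	};
 */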
2204
2205/*
2206 *	Default Socket Callbacks
2207 */
2208
2209static void sock_def_wakeup(struct sock *sk)
2210{
2211	struct socket_wq *wq;
2212
2213	rcu_read_lock();
2214	wq = rcu_dereference(sk->sk_wq);
2215	if (wq_has_sleeper(wq))
2216		wake_up_interruptible_all(&wq->wait);
2217	rcu_read_unlock();
2218}
2219
2220static void sock_def_error_report(struct sock *sk)
2221{
2222	struct socket_wq *wq;
2223
2224	rcu_read_lock();
2225	wq = rcu_dereference(sk->sk_wq);
2226	if (wq_has_sleeper(wq))
2227		wake_up_interruptible_poll(&wq->wait, POLLERR);
2228	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2229	rcu_read_unlock();
2230}
2231
2232static void sock_def_readable(struct sock *sk)
2233{
2234	struct socket_wq *wq;
2235
2236	rcu_read_lock();
2237	wq = rcu_dereference(sk->sk_wq);
2238	if (wq_has_sleeper(wq))
2239		wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
2240						POLLRDNORM | POLLRDBAND);
2241	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2242	rcu_read_unlock();
2243}
2244
2245static void sock_def_write_space(struct sock *sk)
2246{
2247	struct socket_wq *wq;
2248
2249	rcu_read_lock();
2250
2251	/* Do not wake up a writer until he can make "significant"
2252	 * progress.  --DaveM
2253	 */
2254	if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2255		wq = rcu_dereference(sk->sk_wq);
2256		if (wq_has_sleeper(wq))
2257			wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
2258						POLLWRNORM | POLLWRBAND);
2259
2260		/* Should agree with poll, otherwise some programs break */
2261		if (sock_writeable(sk))
2262			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2263	}
2264
2265	rcu_read_unlock();
2266}
2267
2268static void sock_def_destruct(struct sock *sk)
2269{
2270	kfree(sk->sk_protinfo);
2271}
2272
2273void sk_send_sigurg(struct sock *sk)
2274{
2275	if (sk->sk_socket && sk->sk_socket->file)
2276		if (send_sigurg(&sk->sk_socket->file->f_owner))
2277			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2278}
2279EXPORT_SYMBOL(sk_send_sigurg);
2280
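/*
 * sk_reset_timer()/sk_stop_timer() keep a reference on the socket while the
 * timer is pending: sock_hold() is taken only when mod_timer() reports the
 * timer was not already queued, and __sock_put() is issued only when
 * del_timer() actually removed a pending timer.
 */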
2281void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2282		    unsigned long expires)
2283{
2284	if (!mod_timer(timer, expires))
2285		sock_hold(sk);
2286}
2287EXPORT_SYMBOL(sk_reset_timer);
2288
2289void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2290{
2291	if (del_timer(timer))
2292		__sock_put(sk);
2293}
2294EXPORT_SYMBOL(sk_stop_timer);
2295
2296void sock_init_data(struct socket *sock, struct sock *sk)
2297{
2298	skb_queue_head_init(&sk->sk_receive_queue);
2299	skb_queue_head_init(&sk->sk_write_queue);
2300	skb_queue_head_init(&sk->sk_error_queue);
2301#ifdef CONFIG_NET_DMA
2302	skb_queue_head_init(&sk->sk_async_wait_queue);
2303#endif
2304
2305	sk->sk_send_head	=	NULL;
2306
2307	init_timer(&sk->sk_timer);
2308
2309	sk->sk_allocation	=	GFP_KERNEL;
2310	sk->sk_rcvbuf		=	sysctl_rmem_default;
2311	sk->sk_sndbuf		=	sysctl_wmem_default;
2312	sk->sk_state		=	TCP_CLOSE;
2313	sk_set_socket(sk, sock);
2314
2315	sock_set_flag(sk, SOCK_ZAPPED);
2316
2317	if (sock) {
2318		sk->sk_type	=	sock->type;
2319		sk->sk_wq	=	sock->wq;
2320		sock->sk	=	sk;
2321	} else
2322		sk->sk_wq	=	NULL;
2323
2324	spin_lock_init(&sk->sk_dst_lock);
2325	rwlock_init(&sk->sk_callback_lock);
2326	lockdep_set_class_and_name(&sk->sk_callback_lock,
2327			af_callback_keys + sk->sk_family,
2328			af_family_clock_key_strings[sk->sk_family]);
2329
2330	sk->sk_state_change	=	sock_def_wakeup;
2331	sk->sk_data_ready	=	sock_def_readable;
2332	sk->sk_write_space	=	sock_def_write_space;
2333	sk->sk_error_report	=	sock_def_error_report;
2334	sk->sk_destruct		=	sock_def_destruct;
2335
2336	sk->sk_frag.page	=	NULL;
2337	sk->sk_frag.offset	=	0;
2338	sk->sk_peek_off		=	-1;
2339
2340	sk->sk_peer_pid 	=	NULL;
2341	sk->sk_peer_cred	=	NULL;
2342	sk->sk_write_pending	=	0;
2343	sk->sk_rcvlowat		=	1;
2344	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
2345	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
2346
2347	sk->sk_stamp = ktime_set(-1L, 0);
2348
2349#ifdef CONFIG_NET_RX_BUSY_POLL
2350	sk->sk_napi_id		=	0;
2351	sk->sk_ll_usec		=	sysctl_net_busy_read;
2352#endif
2353
2354	sk->sk_max_pacing_rate = ~0U;
2355	sk->sk_pacing_rate = ~0U;
2356	/*
2357	 * Before updating sk_refcnt, we must commit prior changes to memory
2358	 * (Documentation/RCU/rculist_nulls.txt for details)
2359	 */
2360	smp_wmb();
2361	atomic_set(&sk->sk_refcnt, 1);
2362	atomic_set(&sk->sk_drops, 0);
2363}
2364EXPORT_SYMBOL(sock_init_data);
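/*
 * Example (sketch, hypothetical helpers): a protocol's create/init routine
 * usually calls sock_init_data() first and then overrides whichever
 * defaults it needs, e.g.:
 *
 *	sock_init_data(sock, sk);
 *	sk->sk_destruct   = example_sock_destruct;
 *	sk->sk_data_ready = example_data_ready;
 */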
2365
2366void lock_sock_nested(struct sock *sk, int subclass)
2367{
2368	might_sleep();
2369	spin_lock_bh(&sk->sk_lock.slock);
2370	if (sk->sk_lock.owned)
2371		__lock_sock(sk);
2372	sk->sk_lock.owned = 1;
2373	spin_unlock(&sk->sk_lock.slock);
2374	/*
2375	 * The sk_lock has mutex_lock() semantics here:
2376	 */
2377	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2378	local_bh_enable();
2379}
2380EXPORT_SYMBOL(lock_sock_nested);
2381
2382void release_sock(struct sock *sk)
2383{
2384	/*
2385	 * The sk_lock has mutex_unlock() semantics:
2386	 */
2387	mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
2388
2389	spin_lock_bh(&sk->sk_lock.slock);
2390	if (sk->sk_backlog.tail)
2391		__release_sock(sk);
2392
2393	/* Warning: release_cb() might need to release sk ownership,
2394	 * i.e. call sock_release_ownership(sk) before us.
2395	 */
2396	if (sk->sk_prot->release_cb)
2397		sk->sk_prot->release_cb(sk);
2398
2399	sock_release_ownership(sk);
2400	if (waitqueue_active(&sk->sk_lock.wq))
2401		wake_up(&sk->sk_lock.wq);
2402	spin_unlock_bh(&sk->sk_lock.slock);
2403}
2404EXPORT_SYMBOL(release_sock);
2405
2406/**
2407 * lock_sock_fast - fast version of lock_sock
2408 * @sk: socket
2409 *
2410 * This version should be used for very small sections, where the process won't block.
2411 * Returns false if the fast path is taken:
2412 *   sk_lock.slock locked, owned = 0, BH disabled
2413 * Returns true if the slow path is taken:
2414 *   sk_lock.slock unlocked, owned = 1, BH enabled
2415 */
2416bool lock_sock_fast(struct sock *sk)
2417{
2418	might_sleep();
2419	spin_lock_bh(&sk->sk_lock.slock);
2420
2421	if (!sk->sk_lock.owned)
2422		/*
2423		 * Note: we return with BH still disabled (fast path)
2424		 */
2425		return false;
2426
2427	__lock_sock(sk);
2428	sk->sk_lock.owned = 1;
2429	spin_unlock(&sk->sk_lock.slock);
2430	/*
2431	 * The sk_lock has mutex_lock() semantics here:
2432	 */
2433	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2434	local_bh_enable();
2435	return true;
2436}
2437EXPORT_SYMBOL(lock_sock_fast);
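/*
 * Example (sketch): the returned value must be handed back to
 * unlock_sock_fast() so it can undo whichever path was taken:
 *
 *	bool slow = lock_sock_fast(sk);
 *
 *	... short, non-blocking work on the socket ...
 *	unlock_sock_fast(sk, slow);
 */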
2438
2439int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2440{
2441	struct timeval tv;
2442	if (!sock_flag(sk, SOCK_TIMESTAMP))
2443		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2444	tv = ktime_to_timeval(sk->sk_stamp);
2445	if (tv.tv_sec == -1)
2446		return -ENOENT;
2447	if (tv.tv_sec == 0) {
2448		sk->sk_stamp = ktime_get_real();
2449		tv = ktime_to_timeval(sk->sk_stamp);
2450	}
2451	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2452}
2453EXPORT_SYMBOL(sock_get_timestamp);
2454
2455int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2456{
2457	struct timespec ts;
2458	if (!sock_flag(sk, SOCK_TIMESTAMP))
2459		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2460	ts = ktime_to_timespec(sk->sk_stamp);
2461	if (ts.tv_sec == -1)
2462		return -ENOENT;
2463	if (ts.tv_sec == 0) {
2464		sk->sk_stamp = ktime_get_real();
2465		ts = ktime_to_timespec(sk->sk_stamp);
2466	}
2467	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2468}
2469EXPORT_SYMBOL(sock_get_timestampns);
2470
2471void sock_enable_timestamp(struct sock *sk, int flag)
2472{
2473	if (!sock_flag(sk, flag)) {
2474		unsigned long previous_flags = sk->sk_flags;
2475
2476		sock_set_flag(sk, flag);
2477		/*
2478		 * we just set one of the two flags which require net
2479		 * time stamping, but time stamping might have been on
2480		 * already because of the other one
2481		 */
2482		if (!(previous_flags & SK_FLAGS_TIMESTAMP))
2483			net_enable_timestamp();
2484	}
2485}
2486
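/*
 * sock_recv_errqueue - dequeue one skb from sk->sk_error_queue into @msg.
 * Copies the packet data, appends the attached sock_extended_err as a
 * control message of the given @level/@type, sets MSG_ERRQUEUE, and
 * re-arms sk->sk_err from the next queued error (or clears it).
 * Returns the number of bytes copied (capped at @len, with MSG_TRUNC set
 * on truncation), -EAGAIN if the queue is empty, or a copy error.
 */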
2487int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
2488		       int level, int type)
2489{
2490	struct sock_exterr_skb *serr;
2491	struct sk_buff *skb, *skb2;
2492	int copied, err;
2493
2494	err = -EAGAIN;
2495	skb = skb_dequeue(&sk->sk_error_queue);
2496	if (skb == NULL)
2497		goto out;
2498
2499	copied = skb->len;
2500	if (copied > len) {
2501		msg->msg_flags |= MSG_TRUNC;
2502		copied = len;
2503	}
2504	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
2505	if (err)
2506		goto out_free_skb;
2507
2508	sock_recv_timestamp(msg, sk, skb);
2509
2510	serr = SKB_EXT_ERR(skb);
2511	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
2512
2513	msg->msg_flags |= MSG_ERRQUEUE;
2514	err = copied;
2515
2516	/* Reset and regenerate socket error */
2517	spin_lock_bh(&sk->sk_error_queue.lock);
2518	sk->sk_err = 0;
2519	if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) {
2520		sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno;
2521		spin_unlock_bh(&sk->sk_error_queue.lock);
2522		sk->sk_error_report(sk);
2523	} else
2524		spin_unlock_bh(&sk->sk_error_queue.lock);
2525
2526out_free_skb:
2527	kfree_skb(skb);
2528out:
2529	return err;
2530}
2531EXPORT_SYMBOL(sock_recv_errqueue);
2532
2533/*
2534 *	Get a socket option on a socket.
2535 *
2536 *	FIX: POSIX 1003.1g is very ambiguous here. It states that
2537 *	asynchronous errors should be reported by getsockopt. We assume
2538 *	this means if you specify SO_ERROR (otherwise what's the point of it).
2539 */
2540int sock_common_getsockopt(struct socket *sock, int level, int optname,
2541			   char __user *optval, int __user *optlen)
2542{
2543	struct sock *sk = sock->sk;
2544
2545	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2546}
2547EXPORT_SYMBOL(sock_common_getsockopt);
2548
2549#ifdef CONFIG_COMPAT
2550int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2551				  char __user *optval, int __user *optlen)
2552{
2553	struct sock *sk = sock->sk;
2554
2555	if (sk->sk_prot->compat_getsockopt != NULL)
2556		return sk->sk_prot->compat_getsockopt(sk, level, optname,
2557						      optval, optlen);
2558	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2559}
2560EXPORT_SYMBOL(compat_sock_common_getsockopt);
2561#endif
2562
2563int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
2564			struct msghdr *msg, size_t size, int flags)
2565{
2566	struct sock *sk = sock->sk;
2567	int addr_len = 0;
2568	int err;
2569
2570	err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
2571				   flags & ~MSG_DONTWAIT, &addr_len);
2572	if (err >= 0)
2573		msg->msg_namelen = addr_len;
2574	return err;
2575}
2576EXPORT_SYMBOL(sock_common_recvmsg);
2577
2578/*
2579 *	Set socket options on an inet socket.
2580 */
2581int sock_common_setsockopt(struct socket *sock, int level, int optname,
2582			   char __user *optval, unsigned int optlen)
2583{
2584	struct sock *sk = sock->sk;
2585
2586	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2587}
2588EXPORT_SYMBOL(sock_common_setsockopt);
2589
2590#ifdef CONFIG_COMPAT
2591int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2592				  char __user *optval, unsigned int optlen)
2593{
2594	struct sock *sk = sock->sk;
2595
2596	if (sk->sk_prot->compat_setsockopt != NULL)
2597		return sk->sk_prot->compat_setsockopt(sk, level, optname,
2598						      optval, optlen);
2599	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2600}
2601EXPORT_SYMBOL(compat_sock_common_setsockopt);
2602#endif
2603
2604void sk_common_release(struct sock *sk)
2605{
2606	if (sk->sk_prot->destroy)
2607		sk->sk_prot->destroy(sk);
2608
2609	/*
2610	 * Observation: when sk_common_release() is called, processes have
2611	 * no access to the socket, but the network stack still does.
2612	 * Step one, detach it from networking:
2613	 *
2614	 * A. Remove from hash tables.
2615	 */
2616
2617	sk->sk_prot->unhash(sk);
2618
2619	/*
2620	 * At this point the socket cannot receive new packets, but it is possible
2621	 * that some packets are still in flight, because a CPU running the receiver
2622	 * did its hash table lookup before we unhashed the socket. They will reach
2623	 * the receive queue and be purged by the socket destructor.
2624	 *
2625	 * We also still have packets pending on the receive queue and, probably,
2626	 * our own packets waiting in device queues. sock_destroy will drain the
2627	 * receive queue, but transmitted packets will delay socket destruction
2628	 * until the last reference is released.
2629	 */
2630
2631	sock_orphan(sk);
2632
2633	xfrm_sk_free_policy(sk);
2634
2635	sk_refcnt_debug_release(sk);
2636
2637	if (sk->sk_frag.page) {
2638		put_page(sk->sk_frag.page);
2639		sk->sk_frag.page = NULL;
2640	}
2641
2642	sock_put(sk);
2643}
2644EXPORT_SYMBOL(sk_common_release);
2645
2646#ifdef CONFIG_PROC_FS
2647#define PROTO_INUSE_NR	64	/* should be enough for the first time */
2648struct prot_inuse {
2649	int val[PROTO_INUSE_NR];
2650};
2651
2652static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2653
2654#ifdef CONFIG_NET_NS
2655void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2656{
2657	__this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
2658}
2659EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2660
2661int sock_prot_inuse_get(struct net *net, struct proto *prot)
2662{
2663	int cpu, idx = prot->inuse_idx;
2664	int res = 0;
2665
2666	for_each_possible_cpu(cpu)
2667		res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2668
2669	return res >= 0 ? res : 0;
2670}
2671EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2672
2673static int __net_init sock_inuse_init_net(struct net *net)
2674{
2675	net->core.inuse = alloc_percpu(struct prot_inuse);
2676	return net->core.inuse ? 0 : -ENOMEM;
2677}
2678
2679static void __net_exit sock_inuse_exit_net(struct net *net)
2680{
2681	free_percpu(net->core.inuse);
2682}
2683
2684static struct pernet_operations net_inuse_ops = {
2685	.init = sock_inuse_init_net,
2686	.exit = sock_inuse_exit_net,
2687};
2688
2689static __init int net_inuse_init(void)
2690{
2691	if (register_pernet_subsys(&net_inuse_ops))
2692		panic("Cannot initialize net inuse counters");
2693
2694	return 0;
2695}
2696
2697core_initcall(net_inuse_init);
2698#else
2699static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2700
2701void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2702{
2703	__this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
2704}
2705EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2706
2707int sock_prot_inuse_get(struct net *net, struct proto *prot)
2708{
2709	int cpu, idx = prot->inuse_idx;
2710	int res = 0;
2711
2712	for_each_possible_cpu(cpu)
2713		res += per_cpu(prot_inuse, cpu).val[idx];
2714
2715	return res >= 0 ? res : 0;
2716}
2717EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2718#endif
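/*
 * Example (sketch): protocols bump this counter from their hash/unhash (or
 * get_port) paths, typically:
 *
 *	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);	on hash
 *	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);	on unhash
 *
 * The per-protocol totals are reported via /proc/net/protocols below.
 */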
2719
2720static void assign_proto_idx(struct proto *prot)
2721{
2722	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2723
2724	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2725		pr_err("PROTO_INUSE_NR exhausted\n");
2726		return;
2727	}
2728
2729	set_bit(prot->inuse_idx, proto_inuse_idx);
2730}
2731
2732static void release_proto_idx(struct proto *prot)
2733{
2734	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2735		clear_bit(prot->inuse_idx, proto_inuse_idx);
2736}
2737#else
2738static inline void assign_proto_idx(struct proto *prot)
2739{
2740}
2741
2742static inline void release_proto_idx(struct proto *prot)
2743{
2744}
2745#endif
2746
2747int proto_register(struct proto *prot, int alloc_slab)
2748{
2749	if (alloc_slab) {
2750		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2751					SLAB_HWCACHE_ALIGN | prot->slab_flags,
2752					NULL);
2753
2754		if (prot->slab == NULL) {
2755			pr_crit("%s: Can't create sock SLAB cache!\n",
2756				prot->name);
2757			goto out;
2758		}
2759
2760		if (prot->rsk_prot != NULL) {
2761			prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name);
2762			if (prot->rsk_prot->slab_name == NULL)
2763				goto out_free_sock_slab;
2764
2765			prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
2766								 prot->rsk_prot->obj_size, 0,
2767								 SLAB_HWCACHE_ALIGN, NULL);
2768
2769			if (prot->rsk_prot->slab == NULL) {
2770				pr_crit("%s: Can't create request sock SLAB cache!\n",
2771					prot->name);
2772				goto out_free_request_sock_slab_name;
2773			}
2774		}
2775
2776		if (prot->twsk_prot != NULL) {
2777			prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
2778
2779			if (prot->twsk_prot->twsk_slab_name == NULL)
2780				goto out_free_request_sock_slab;
2781
2782			prot->twsk_prot->twsk_slab =
2783				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2784						  prot->twsk_prot->twsk_obj_size,
2785						  0,
2786						  SLAB_HWCACHE_ALIGN |
2787							prot->slab_flags,
2788						  NULL);
2789			if (prot->twsk_prot->twsk_slab == NULL)
2790				goto out_free_timewait_sock_slab_name;
2791		}
2792	}
2793
2794	mutex_lock(&proto_list_mutex);
2795	list_add(&prot->node, &proto_list);
2796	assign_proto_idx(prot);
2797	mutex_unlock(&proto_list_mutex);
2798	return 0;
2799
2800out_free_timewait_sock_slab_name:
2801	kfree(prot->twsk_prot->twsk_slab_name);
2802out_free_request_sock_slab:
2803	if (prot->rsk_prot && prot->rsk_prot->slab) {
2804		kmem_cache_destroy(prot->rsk_prot->slab);
2805		prot->rsk_prot->slab = NULL;
2806	}
2807out_free_request_sock_slab_name:
2808	if (prot->rsk_prot)
2809		kfree(prot->rsk_prot->slab_name);
2810out_free_sock_slab:
2811	kmem_cache_destroy(prot->slab);
2812	prot->slab = NULL;
2813out:
2814	return -ENOBUFS;
2815}
2816EXPORT_SYMBOL(proto_register);
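/*
 * Example (sketch, hypothetical protocol): registration pairs a struct proto
 * with a slab sized for the protocol's private socket structure, e.g.:
 *
 *	static struct proto example_proto = {
 *		.name	  = "EXAMPLE",
 *		.owner	  = THIS_MODULE,
 *		.obj_size = sizeof(struct example_sock),
 *	};
 *
 *	err = proto_register(&example_proto, 1);
 *	...
 *	proto_unregister(&example_proto);
 *
 * Passing 0 instead of 1 skips creation of the backing slab cache.
 */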
2817
2818void proto_unregister(struct proto *prot)
2819{
2820	mutex_lock(&proto_list_mutex);
2821	release_proto_idx(prot);
2822	list_del(&prot->node);
2823	mutex_unlock(&proto_list_mutex);
2824
2825	if (prot->slab != NULL) {
2826		kmem_cache_destroy(prot->slab);
2827		prot->slab = NULL;
2828	}
2829
2830	if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
2831		kmem_cache_destroy(prot->rsk_prot->slab);
2832		kfree(prot->rsk_prot->slab_name);
2833		prot->rsk_prot->slab = NULL;
2834	}
2835
2836	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2837		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2838		kfree(prot->twsk_prot->twsk_slab_name);
2839		prot->twsk_prot->twsk_slab = NULL;
2840	}
2841}
2842EXPORT_SYMBOL(proto_unregister);
2843
2844#ifdef CONFIG_PROC_FS
2845static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2846	__acquires(proto_list_mutex)
2847{
2848	mutex_lock(&proto_list_mutex);
2849	return seq_list_start_head(&proto_list, *pos);
2850}
2851
2852static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2853{
2854	return seq_list_next(v, &proto_list, pos);
2855}
2856
2857static void proto_seq_stop(struct seq_file *seq, void *v)
2858	__releases(proto_list_mutex)
2859{
2860	mutex_unlock(&proto_list_mutex);
2861}
2862
2863static char proto_method_implemented(const void *method)
2864{
2865	return method == NULL ? 'n' : 'y';
2866}
2867static long sock_prot_memory_allocated(struct proto *proto)
2868{
2869	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
2870}
2871
2872static char *sock_prot_memory_pressure(struct proto *proto)
2873{
2874	return proto->memory_pressure != NULL ?
2875	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
2876}
2877
2878static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2879{
2880
2881	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
2882			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2883		   proto->name,
2884		   proto->obj_size,
2885		   sock_prot_inuse_get(seq_file_net(seq), proto),
2886		   sock_prot_memory_allocated(proto),
2887		   sock_prot_memory_pressure(proto),
2888		   proto->max_header,
2889		   proto->slab == NULL ? "no" : "yes",
2890		   module_name(proto->owner),
2891		   proto_method_implemented(proto->close),
2892		   proto_method_implemented(proto->connect),
2893		   proto_method_implemented(proto->disconnect),
2894		   proto_method_implemented(proto->accept),
2895		   proto_method_implemented(proto->ioctl),
2896		   proto_method_implemented(proto->init),
2897		   proto_method_implemented(proto->destroy),
2898		   proto_method_implemented(proto->shutdown),
2899		   proto_method_implemented(proto->setsockopt),
2900		   proto_method_implemented(proto->getsockopt),
2901		   proto_method_implemented(proto->sendmsg),
2902		   proto_method_implemented(proto->recvmsg),
2903		   proto_method_implemented(proto->sendpage),
2904		   proto_method_implemented(proto->bind),
2905		   proto_method_implemented(proto->backlog_rcv),
2906		   proto_method_implemented(proto->hash),
2907		   proto_method_implemented(proto->unhash),
2908		   proto_method_implemented(proto->get_port),
2909		   proto_method_implemented(proto->enter_memory_pressure));
2910}
2911
2912static int proto_seq_show(struct seq_file *seq, void *v)
2913{
2914	if (v == &proto_list)
2915		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2916			   "protocol",
2917			   "size",
2918			   "sockets",
2919			   "memory",
2920			   "press",
2921			   "maxhdr",
2922			   "slab",
2923			   "module",
2924			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2925	else
2926		proto_seq_printf(seq, list_entry(v, struct proto, node));
2927	return 0;
2928}
2929
2930static const struct seq_operations proto_seq_ops = {
2931	.start  = proto_seq_start,
2932	.next   = proto_seq_next,
2933	.stop   = proto_seq_stop,
2934	.show   = proto_seq_show,
2935};
2936
2937static int proto_seq_open(struct inode *inode, struct file *file)
2938{
2939	return seq_open_net(inode, file, &proto_seq_ops,
2940			    sizeof(struct seq_net_private));
2941}
2942
2943static const struct file_operations proto_seq_fops = {
2944	.owner		= THIS_MODULE,
2945	.open		= proto_seq_open,
2946	.read		= seq_read,
2947	.llseek		= seq_lseek,
2948	.release	= seq_release_net,
2949};
2950
2951static __net_init int proto_init_net(struct net *net)
2952{
2953	if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
2954		return -ENOMEM;
2955
2956	return 0;
2957}
2958
2959static __net_exit void proto_exit_net(struct net *net)
2960{
2961	remove_proc_entry("protocols", net->proc_net);
2962}
2963
2964
2965static __net_initdata struct pernet_operations proto_net_ops = {
2966	.init = proto_init_net,
2967	.exit = proto_exit_net,
2968};
2969
2970static int __init proto_init(void)
2971{
2972	return register_pernet_subsys(&proto_net_ops);
2973}
2974
2975subsys_initcall(proto_init);
2976
2977#endif /* PROC_FS */
2978