sock.c revision 86f8515f9721fa171483f0fe0391968fbb949cc9
1/*
2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
3 *		operating system.  INET is implemented using the  BSD Socket
4 *		interface as the means of communication with the user level.
5 *
6 *		Generic socket support routines. Memory allocators, socket lock/release
7 *		handler for protocols to use and generic option handler.
8 *
9 *
10 * Authors:	Ross Biro
11 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 *		Florian La Roche, <flla@stud.uni-sb.de>
13 *		Alan Cox, <A.Cox@swansea.ac.uk>
14 *
15 * Fixes:
16 *		Alan Cox	: 	Numerous verify_area() problems
17 *		Alan Cox	:	Connecting on a connecting socket
18 *					now returns an error for tcp.
19 *		Alan Cox	:	sock->protocol is set correctly.
20 *					and is not sometimes left as 0.
21 *		Alan Cox	:	connect handles icmp errors on a
22 *					connect properly. Unfortunately there
23 *					is a restart syscall nasty there. I
24 *					can't match BSD without hacking the C
25 *					library. Ideas urgently sought!
26 *		Alan Cox	:	Disallow bind() to addresses that are
27 *					not ours - especially broadcast ones!!
28 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30 *					instead they leave that for the DESTROY timer.
31 *		Alan Cox	:	Clean up error flag in accept
32 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33 *					was buggy. Put a remove_sock() in the handler
34 *					for memory when we hit 0. Also altered the timer
35 *					code. The ACK stuff can wait and needs major
36 *					TCP layer surgery.
37 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38 *					and fixed timer/inet_bh race.
39 *		Alan Cox	:	Added zapped flag for TCP
40 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47 *	Pauline Middelink	:	identd support
48 *		Alan Cox	:	Fixed connect() taking signals I think.
49 *		Alan Cox	:	SO_LINGER supported
50 *		Alan Cox	:	Error reporting fixes
51 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52 *		Alan Cox	:	inet sockets don't set sk->type!
53 *		Alan Cox	:	Split socket option code
54 *		Alan Cox	:	Callbacks
55 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56 *		Alex		:	Removed restriction on inet fioctl
57 *		Alan Cox	:	Splitting INET from NET core
58 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60 *		Alan Cox	:	Split IP from generic code
61 *		Alan Cox	:	New kfree_skbmem()
62 *		Alan Cox	:	Make SO_DEBUG superuser only.
63 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64 *					(compatibility fix)
65 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66 *		Alan Cox	:	Allocator for a socket is settable.
67 *		Alan Cox	:	SO_ERROR includes soft errors.
68 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69 *		Alan Cox	: 	Generic socket allocation to make hooks
70 *					easier (suggested by Craig Metz).
71 *		Michael Pall	:	SO_ERROR returns positive errno again
72 *              Steve Whitehouse:       Added default destructor to free
73 *                                      protocol private data.
74 *              Steve Whitehouse:       Added various other default routines
75 *                                      common to several socket families.
76 *              Chris Evans     :       Call suser() check last on F_SETOWN
77 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79 *		Andi Kleen	:	Fix write_space callback
80 *		Chris Evans	:	Security fixes - signedness again
81 *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82 *
83 * To Fix:
84 *
85 *
86 *		This program is free software; you can redistribute it and/or
87 *		modify it under the terms of the GNU General Public License
88 *		as published by the Free Software Foundation; either version
89 *		2 of the License, or (at your option) any later version.
90 */
91
92#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
93
94#include <linux/capability.h>
95#include <linux/errno.h>
96#include <linux/errqueue.h>
97#include <linux/types.h>
98#include <linux/socket.h>
99#include <linux/in.h>
100#include <linux/kernel.h>
101#include <linux/module.h>
102#include <linux/proc_fs.h>
103#include <linux/seq_file.h>
104#include <linux/sched.h>
105#include <linux/timer.h>
106#include <linux/string.h>
107#include <linux/sockios.h>
108#include <linux/net.h>
109#include <linux/mm.h>
110#include <linux/slab.h>
111#include <linux/interrupt.h>
112#include <linux/poll.h>
113#include <linux/tcp.h>
114#include <linux/init.h>
115#include <linux/highmem.h>
116#include <linux/user_namespace.h>
117#include <linux/static_key.h>
118#include <linux/memcontrol.h>
119#include <linux/prefetch.h>
120
121#include <asm/uaccess.h>
122
123#include <linux/netdevice.h>
124#include <net/protocol.h>
125#include <linux/skbuff.h>
126#include <net/net_namespace.h>
127#include <net/request_sock.h>
128#include <net/sock.h>
129#include <linux/net_tstamp.h>
130#include <net/xfrm.h>
131#include <linux/ipsec.h>
132#include <net/cls_cgroup.h>
133#include <net/netprio_cgroup.h>
134
135#include <linux/filter.h>
136
137#include <trace/events/sock.h>
138
139#ifdef CONFIG_INET
140#include <net/tcp.h>
141#endif
142
143#include <net/busy_poll.h>
144
145static DEFINE_MUTEX(proto_list_mutex);
146static LIST_HEAD(proto_list);
147
148#ifdef CONFIG_MEMCG_KMEM
149int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
150{
151	struct proto *proto;
152	int ret = 0;
153
154	mutex_lock(&proto_list_mutex);
155	list_for_each_entry(proto, &proto_list, node) {
156		if (proto->init_cgroup) {
157			ret = proto->init_cgroup(memcg, ss);
158			if (ret)
159				goto out;
160		}
161	}
162
163	mutex_unlock(&proto_list_mutex);
164	return ret;
165out:
166	list_for_each_entry_continue_reverse(proto, &proto_list, node)
167		if (proto->destroy_cgroup)
168			proto->destroy_cgroup(memcg);
169	mutex_unlock(&proto_list_mutex);
170	return ret;
171}
172
173void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg)
174{
175	struct proto *proto;
176
177	mutex_lock(&proto_list_mutex);
178	list_for_each_entry_reverse(proto, &proto_list, node)
179		if (proto->destroy_cgroup)
180			proto->destroy_cgroup(memcg);
181	mutex_unlock(&proto_list_mutex);
182}
183#endif
184
185/*
186 * Each address family might have different locking rules, so we have
187 * one slock key per address family:
188 */
189static struct lock_class_key af_family_keys[AF_MAX];
190static struct lock_class_key af_family_slock_keys[AF_MAX];
191
192#if defined(CONFIG_MEMCG_KMEM)
193struct static_key memcg_socket_limit_enabled;
194EXPORT_SYMBOL(memcg_socket_limit_enabled);
195#endif
196
197/*
198 * Make lock validator output more readable. (We pre-construct these
199 * strings at build time, so that runtime initialization of socket
200 * locks is fast):
201 */
202static const char *const af_family_key_strings[AF_MAX+1] = {
203  "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
204  "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
205  "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
206  "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
207  "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
208  "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
209  "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
210  "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
211  "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
212  "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
213  "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-AF_IUCV"     ,
214  "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
215  "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
216  "sk_lock-AF_NFC"   , "sk_lock-AF_VSOCK"    , "sk_lock-AF_MAX"
217};
218static const char *const af_family_slock_key_strings[AF_MAX+1] = {
219  "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
220  "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
221  "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
222  "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
223  "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
224  "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
225  "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
226  "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
227  "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
228  "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
229  "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
230  "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
231  "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
232  "slock-AF_NFC"   , "slock-AF_VSOCK"    , "slock-AF_MAX"
233};
234static const char *const af_family_clock_key_strings[AF_MAX+1] = {
235  "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
236  "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
237  "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
238  "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
239  "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
240  "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
241  "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
242  "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
243  "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
244  "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
245  "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
246  "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
247  "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
248  "clock-AF_NFC"   , "clock-AF_VSOCK"    , "clock-AF_MAX"
249};
250
251/*
252 * sk_callback_lock locking rules are per-address-family,
253 * so split the lock classes by using a per-AF key:
254 */
255static struct lock_class_key af_callback_keys[AF_MAX];
256
257/* Take into consideration the size of the struct sk_buff overhead in the
258 * determination of these values, since that is non-constant across
259 * platforms.  This makes socket queueing behavior and performance
260 * not depend upon such differences.
261 */
262#define _SK_MEM_PACKETS		256
263#define _SK_MEM_OVERHEAD	SKB_TRUESIZE(256)
264#define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
265#define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
266
267/* Run time adjustable parameters. */
268__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
269EXPORT_SYMBOL(sysctl_wmem_max);
270__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
271EXPORT_SYMBOL(sysctl_rmem_max);
272__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
273__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
274
275/* Maximal space eaten by iovec or ancillary data plus some space */
276int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
277EXPORT_SYMBOL(sysctl_optmem_max);
278
279struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
280EXPORT_SYMBOL_GPL(memalloc_socks);
281
282/**
283 * sk_set_memalloc - sets %SOCK_MEMALLOC
284 * @sk: socket to set it on
285 *
286 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
287 * It's the responsibility of the admin to adjust min_free_kbytes
288 * to meet the requirements.
289 */
290void sk_set_memalloc(struct sock *sk)
291{
292	sock_set_flag(sk, SOCK_MEMALLOC);
293	sk->sk_allocation |= __GFP_MEMALLOC;
294	static_key_slow_inc(&memalloc_socks);
295}
296EXPORT_SYMBOL_GPL(sk_set_memalloc);
297
298void sk_clear_memalloc(struct sock *sk)
299{
300	sock_reset_flag(sk, SOCK_MEMALLOC);
301	sk->sk_allocation &= ~__GFP_MEMALLOC;
302	static_key_slow_dec(&memalloc_socks);
303
304	/*
305	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
306	 * progress of swapping. However, if SOCK_MEMALLOC is cleared while
307	 * it has rmem allocations there is a risk that the user of the
308	 * socket cannot make forward progress due to exceeding the rmem
309	 * limits. By rights, sk_clear_memalloc() should only be called
310	 * on sockets being torn down but warn and reset the accounting if
311	 * that assumption breaks.
312	 */
313	if (WARN_ON(sk->sk_forward_alloc))
314		sk_mem_reclaim(sk);
315}
316EXPORT_SYMBOL_GPL(sk_clear_memalloc);
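
/*
 * Illustrative sketch (not part of this file): a driver that services
 * memory reclaim over the network -- e.g. a swap-over-network block
 * device -- would mark its transport socket like this so that its
 * allocations may dip into the emergency reserves, and would undo it
 * before releasing the socket.  The names below are hypothetical.
 *
 *	static int netswap_connect(struct netswap_dev *dev, struct socket *sock)
 *	{
 *		dev->sock = sock;
 *		sk_set_memalloc(sock->sk);
 *		return 0;
 *	}
 *
 *	static void netswap_disconnect(struct netswap_dev *dev)
 *	{
 *		sk_clear_memalloc(dev->sock->sk);
 *		sock_release(dev->sock);
 *		dev->sock = NULL;
 *	}
 */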
317
318int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
319{
320	int ret;
321	unsigned long pflags = current->flags;
322
323	/* these should have been dropped before queueing */
324	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
325
326	current->flags |= PF_MEMALLOC;
327	ret = sk->sk_backlog_rcv(sk, skb);
328	tsk_restore_flags(current, pflags, PF_MEMALLOC);
329
330	return ret;
331}
332EXPORT_SYMBOL(__sk_backlog_rcv);
333
334static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
335{
336	struct timeval tv;
337
338	if (optlen < sizeof(tv))
339		return -EINVAL;
340	if (copy_from_user(&tv, optval, sizeof(tv)))
341		return -EFAULT;
342	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
343		return -EDOM;
344
345	if (tv.tv_sec < 0) {
346		static int warned __read_mostly;
347
348		*timeo_p = 0;
349		if (warned < 10 && net_ratelimit()) {
350			warned++;
351			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
352				__func__, current->comm, task_pid_nr(current));
353		}
354		return 0;
355	}
356	*timeo_p = MAX_SCHEDULE_TIMEOUT;
357	if (tv.tv_sec == 0 && tv.tv_usec == 0)
358		return 0;
359	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
360		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
361	return 0;
362}
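
/*
 * For reference, this is the userspace side of the conversion above
 * (a minimal sketch, error handling omitted).  A zero timeval means
 * "block forever", which is why it is mapped to MAX_SCHEDULE_TIMEOUT:
 *
 *	struct timeval tv = { .tv_sec = 5, .tv_usec = 0 };
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
 */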
363
364static void sock_warn_obsolete_bsdism(const char *name)
365{
366	static int warned;
367	static char warncomm[TASK_COMM_LEN];
368	if (strcmp(warncomm, current->comm) && warned < 5) {
369		strcpy(warncomm,  current->comm);
370		pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
371			warncomm, name);
372		warned++;
373	}
374}
375
376#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
377
378static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
379{
380	if (sk->sk_flags & flags) {
381		sk->sk_flags &= ~flags;
382		if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP))
383			net_disable_timestamp();
384	}
385}
386
387
388int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
389{
390	int err;
391	int skb_len;
392	unsigned long flags;
393	struct sk_buff_head *list = &sk->sk_receive_queue;
394
395	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
396		atomic_inc(&sk->sk_drops);
397		trace_sock_rcvqueue_full(sk, skb);
398		return -ENOMEM;
399	}
400
401	err = sk_filter(sk, skb);
402	if (err)
403		return err;
404
405	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
406		atomic_inc(&sk->sk_drops);
407		return -ENOBUFS;
408	}
409
410	skb->dev = NULL;
411	skb_set_owner_r(skb, sk);
412
413	/* Cache the SKB length before we tack it onto the receive
414	 * queue.  Once it is added it no longer belongs to us and
415	 * may be freed by other threads of control pulling packets
416	 * from the queue.
417	 */
418	skb_len = skb->len;
419
420	/* We escape from the RCU-protected region, so make sure we don't
421	 * leak a non-refcounted dst.
422	 */
423	skb_dst_force(skb);
424
425	spin_lock_irqsave(&list->lock, flags);
426	skb->dropcount = atomic_read(&sk->sk_drops);
427	__skb_queue_tail(list, skb);
428	spin_unlock_irqrestore(&list->lock, flags);
429
430	if (!sock_flag(sk, SOCK_DEAD))
431		sk->sk_data_ready(sk, skb_len);
432	return 0;
433}
434EXPORT_SYMBOL(sock_queue_rcv_skb);
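
/*
 * Typical use (illustrative, not from this file): a datagram protocol's
 * input path hands a fully built skb to the owning socket and frees it
 * itself when queueing fails:
 *
 *	static int myproto_rcv_skb(struct sock *sk, struct sk_buff *skb)
 *	{
 *		if (sock_queue_rcv_skb(sk, skb) < 0) {
 *			kfree_skb(skb);
 *			return NET_RX_DROP;
 *		}
 *		return NET_RX_SUCCESS;
 *	}
 *
 * "myproto" is a placeholder name; the raw socket receive path is one
 * real caller that follows this pattern.
 */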
435
436int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
437{
438	int rc = NET_RX_SUCCESS;
439
440	if (sk_filter(sk, skb))
441		goto discard_and_relse;
442
443	skb->dev = NULL;
444
445	if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) {
446		atomic_inc(&sk->sk_drops);
447		goto discard_and_relse;
448	}
449	if (nested)
450		bh_lock_sock_nested(sk);
451	else
452		bh_lock_sock(sk);
453	if (!sock_owned_by_user(sk)) {
454		/*
455		 * trylock + unlock semantics:
456		 */
457		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
458
459		rc = sk_backlog_rcv(sk, skb);
460
461		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
462	} else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
463		bh_unlock_sock(sk);
464		atomic_inc(&sk->sk_drops);
465		goto discard_and_relse;
466	}
467
468	bh_unlock_sock(sk);
469out:
470	sock_put(sk);
471	return rc;
472discard_and_relse:
473	kfree_skb(skb);
474	goto out;
475}
476EXPORT_SYMBOL(sk_receive_skb);
477
478struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
479{
480	struct dst_entry *dst = __sk_dst_get(sk);
481
482	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
483		sk_tx_queue_clear(sk);
484		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
485		dst_release(dst);
486		return NULL;
487	}
488
489	return dst;
490}
491EXPORT_SYMBOL(__sk_dst_check);
492
493struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
494{
495	struct dst_entry *dst = sk_dst_get(sk);
496
497	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
498		sk_dst_reset(sk);
499		dst_release(dst);
500		return NULL;
501	}
502
503	return dst;
504}
505EXPORT_SYMBOL(sk_dst_check);
506
507static int sock_setbindtodevice(struct sock *sk, char __user *optval,
508				int optlen)
509{
510	int ret = -ENOPROTOOPT;
511#ifdef CONFIG_NETDEVICES
512	struct net *net = sock_net(sk);
513	char devname[IFNAMSIZ];
514	int index;
515
516	/* Sorry... */
517	ret = -EPERM;
518	if (!ns_capable(net->user_ns, CAP_NET_RAW))
519		goto out;
520
521	ret = -EINVAL;
522	if (optlen < 0)
523		goto out;
524
525	/* Bind this socket to a particular device like "eth0",
526	 * as specified in the passed interface name. If the
527	 * name is "" or the option length is zero the socket
528	 * is not bound.
529	 */
530	if (optlen > IFNAMSIZ - 1)
531		optlen = IFNAMSIZ - 1;
532	memset(devname, 0, sizeof(devname));
533
534	ret = -EFAULT;
535	if (copy_from_user(devname, optval, optlen))
536		goto out;
537
538	index = 0;
539	if (devname[0] != '\0') {
540		struct net_device *dev;
541
542		rcu_read_lock();
543		dev = dev_get_by_name_rcu(net, devname);
544		if (dev)
545			index = dev->ifindex;
546		rcu_read_unlock();
547		ret = -ENODEV;
548		if (!dev)
549			goto out;
550	}
551
552	lock_sock(sk);
553	sk->sk_bound_dev_if = index;
554	sk_dst_reset(sk);
555	release_sock(sk);
556
557	ret = 0;
558
559out:
560#endif
561
562	return ret;
563}
564
565static int sock_getbindtodevice(struct sock *sk, char __user *optval,
566				int __user *optlen, int len)
567{
568	int ret = -ENOPROTOOPT;
569#ifdef CONFIG_NETDEVICES
570	struct net *net = sock_net(sk);
571	char devname[IFNAMSIZ];
572
573	if (sk->sk_bound_dev_if == 0) {
574		len = 0;
575		goto zero;
576	}
577
578	ret = -EINVAL;
579	if (len < IFNAMSIZ)
580		goto out;
581
582	ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
583	if (ret)
584		goto out;
585
586	len = strlen(devname) + 1;
587
588	ret = -EFAULT;
589	if (copy_to_user(optval, devname, len))
590		goto out;
591
592zero:
593	ret = -EFAULT;
594	if (put_user(len, optlen))
595		goto out;
596
597	ret = 0;
598
599out:
600#endif
601
602	return ret;
603}
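
/*
 * Userspace view of the two helpers above (a minimal sketch; setting
 * the option requires CAP_NET_RAW, and error handling is omitted):
 *
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "eth0", sizeof("eth0"));
 *
 *	char name[IFNAMSIZ];
 *	socklen_t len = sizeof(name);
 *	getsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, name, &len);
 */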
604
605static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
606{
607	if (valbool)
608		sock_set_flag(sk, bit);
609	else
610		sock_reset_flag(sk, bit);
611}
612
613/*
614 *	This is meant for all protocols to use and covers goings on
615 *	at the socket level. Everything here is generic.
616 */
617
618int sock_setsockopt(struct socket *sock, int level, int optname,
619		    char __user *optval, unsigned int optlen)
620{
621	struct sock *sk = sock->sk;
622	int val;
623	int valbool;
624	struct linger ling;
625	int ret = 0;
626
627	/*
628	 *	Options without arguments
629	 */
630
631	if (optname == SO_BINDTODEVICE)
632		return sock_setbindtodevice(sk, optval, optlen);
633
634	if (optlen < sizeof(int))
635		return -EINVAL;
636
637	if (get_user(val, (int __user *)optval))
638		return -EFAULT;
639
640	valbool = val ? 1 : 0;
641
642	lock_sock(sk);
643
644	switch (optname) {
645	case SO_DEBUG:
646		if (val && !capable(CAP_NET_ADMIN))
647			ret = -EACCES;
648		else
649			sock_valbool_flag(sk, SOCK_DBG, valbool);
650		break;
651	case SO_REUSEADDR:
652		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
653		break;
654	case SO_REUSEPORT:
655		sk->sk_reuseport = valbool;
656		break;
657	case SO_TYPE:
658	case SO_PROTOCOL:
659	case SO_DOMAIN:
660	case SO_ERROR:
661		ret = -ENOPROTOOPT;
662		break;
663	case SO_DONTROUTE:
664		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
665		break;
666	case SO_BROADCAST:
667		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
668		break;
669	case SO_SNDBUF:
670		/* Don't return an error on this; BSD doesn't, and if you
671		 * think about it, this is right. Otherwise apps have to
672		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
673		 * are treated in BSD as hints.
674		 */
675		val = min_t(u32, val, sysctl_wmem_max);
676set_sndbuf:
677		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
678		sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF);
679		/* Wake up sending tasks if we upped the value. */
680		sk->sk_write_space(sk);
681		break;
682
683	case SO_SNDBUFFORCE:
684		if (!capable(CAP_NET_ADMIN)) {
685			ret = -EPERM;
686			break;
687		}
688		goto set_sndbuf;
689
690	case SO_RCVBUF:
691		/* Don't return an error on this; BSD doesn't, and if you
692		 * think about it, this is right. Otherwise apps have to
693		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
694		 * are treated in BSD as hints.
695		 */
696		val = min_t(u32, val, sysctl_rmem_max);
697set_rcvbuf:
698		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
699		/*
700		 * We double it on the way in to account for
701		 * "struct sk_buff" etc. overhead.   Applications
702		 * assume that the SO_RCVBUF setting they make will
703		 * allow that much actual data to be received on that
704		 * socket.
705		 *
706		 * Applications are unaware that "struct sk_buff" and
707		 * other overheads allocate from the receive buffer
708		 * during socket buffer allocation.
709		 *
710		 * And after considering the possible alternatives,
711		 * returning the value we actually used in getsockopt
712		 * is the most desirable behavior.
713		 */
714		sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF);
715		break;
716
717	case SO_RCVBUFFORCE:
718		if (!capable(CAP_NET_ADMIN)) {
719			ret = -EPERM;
720			break;
721		}
722		goto set_rcvbuf;
723
724	case SO_KEEPALIVE:
725#ifdef CONFIG_INET
726		if (sk->sk_protocol == IPPROTO_TCP &&
727		    sk->sk_type == SOCK_STREAM)
728			tcp_set_keepalive(sk, valbool);
729#endif
730		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
731		break;
732
733	case SO_OOBINLINE:
734		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
735		break;
736
737	case SO_NO_CHECK:
738		sk->sk_no_check = valbool;
739		break;
740
741	case SO_PRIORITY:
742		if ((val >= 0 && val <= 6) ||
743		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
744			sk->sk_priority = val;
745		else
746			ret = -EPERM;
747		break;
748
749	case SO_LINGER:
750		if (optlen < sizeof(ling)) {
751			ret = -EINVAL;	/* 1003.1g */
752			break;
753		}
754		if (copy_from_user(&ling, optval, sizeof(ling))) {
755			ret = -EFAULT;
756			break;
757		}
758		if (!ling.l_onoff)
759			sock_reset_flag(sk, SOCK_LINGER);
760		else {
761#if (BITS_PER_LONG == 32)
762			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
763				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
764			else
765#endif
766				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
767			sock_set_flag(sk, SOCK_LINGER);
768		}
769		break;
770
771	case SO_BSDCOMPAT:
772		sock_warn_obsolete_bsdism("setsockopt");
773		break;
774
775	case SO_PASSCRED:
776		if (valbool)
777			set_bit(SOCK_PASSCRED, &sock->flags);
778		else
779			clear_bit(SOCK_PASSCRED, &sock->flags);
780		break;
781
782	case SO_TIMESTAMP:
783	case SO_TIMESTAMPNS:
784		if (valbool)  {
785			if (optname == SO_TIMESTAMP)
786				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
787			else
788				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
789			sock_set_flag(sk, SOCK_RCVTSTAMP);
790			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
791		} else {
792			sock_reset_flag(sk, SOCK_RCVTSTAMP);
793			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
794		}
795		break;
796
797	case SO_TIMESTAMPING:
798		if (val & ~SOF_TIMESTAMPING_MASK) {
799			ret = -EINVAL;
800			break;
801		}
802		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
803				  val & SOF_TIMESTAMPING_TX_HARDWARE);
804		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
805				  val & SOF_TIMESTAMPING_TX_SOFTWARE);
806		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
807				  val & SOF_TIMESTAMPING_RX_HARDWARE);
808		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
809			sock_enable_timestamp(sk,
810					      SOCK_TIMESTAMPING_RX_SOFTWARE);
811		else
812			sock_disable_timestamp(sk,
813					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
814		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
815				  val & SOF_TIMESTAMPING_SOFTWARE);
816		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
817				  val & SOF_TIMESTAMPING_SYS_HARDWARE);
818		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
819				  val & SOF_TIMESTAMPING_RAW_HARDWARE);
820		break;
821
822	case SO_RCVLOWAT:
823		if (val < 0)
824			val = INT_MAX;
825		sk->sk_rcvlowat = val ? : 1;
826		break;
827
828	case SO_RCVTIMEO:
829		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
830		break;
831
832	case SO_SNDTIMEO:
833		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
834		break;
835
836	case SO_ATTACH_FILTER:
837		ret = -EINVAL;
838		if (optlen == sizeof(struct sock_fprog)) {
839			struct sock_fprog fprog;
840
841			ret = -EFAULT;
842			if (copy_from_user(&fprog, optval, sizeof(fprog)))
843				break;
844
845			ret = sk_attach_filter(&fprog, sk);
846		}
847		break;
848
849	case SO_DETACH_FILTER:
850		ret = sk_detach_filter(sk);
851		break;
852
853	case SO_LOCK_FILTER:
854		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
855			ret = -EPERM;
856		else
857			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
858		break;
859
860	case SO_PASSSEC:
861		if (valbool)
862			set_bit(SOCK_PASSSEC, &sock->flags);
863		else
864			clear_bit(SOCK_PASSSEC, &sock->flags);
865		break;
866	case SO_MARK:
867		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
868			ret = -EPERM;
869		else
870			sk->sk_mark = val;
871		break;
872
873		/* We implement SO_SNDLOWAT etc. as not
874		   settable (1003.1g 5.3). */
875	case SO_RXQ_OVFL:
876		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
877		break;
878
879	case SO_WIFI_STATUS:
880		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
881		break;
882
883	case SO_PEEK_OFF:
884		if (sock->ops->set_peek_off)
885			sock->ops->set_peek_off(sk, val);
886		else
887			ret = -EOPNOTSUPP;
888		break;
889
890	case SO_NOFCS:
891		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
892		break;
893
894	case SO_SELECT_ERR_QUEUE:
895		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
896		break;
897
898#ifdef CONFIG_NET_RX_BUSY_POLL
899	case SO_BUSY_POLL:
900		/* allow unprivileged users to decrease the value */
901		if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
902			ret = -EPERM;
903		else {
904			if (val < 0)
905				ret = -EINVAL;
906			else
907				sk->sk_ll_usec = val;
908		}
909		break;
910#endif
911
912	case SO_MAX_PACING_RATE:
913		sk->sk_max_pacing_rate = val;
914		sk->sk_pacing_rate = min(sk->sk_pacing_rate,
915					 sk->sk_max_pacing_rate);
916		break;
917
918	default:
919		ret = -ENOPROTOOPT;
920		break;
921	}
922	release_sock(sk);
923	return ret;
924}
925EXPORT_SYMBOL(sock_setsockopt);
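
/*
 * Example of the SO_RCVBUF/SO_SNDBUF behaviour implemented above, as
 * seen from userspace (illustrative): the kernel doubles the requested
 * value to account for struct sk_buff overhead, and getsockopt()
 * reports the value actually in use.
 *
 *	int val = 65536, out;
 *	socklen_t len = sizeof(out);
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
 *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &out, &len);
 *
 * out is then 131072 (2 * 65536), assuming 65536 does not exceed
 * net.core.rmem_max.
 */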
926
927
928void cred_to_ucred(struct pid *pid, const struct cred *cred,
929		   struct ucred *ucred)
930{
931	ucred->pid = pid_vnr(pid);
932	ucred->uid = ucred->gid = -1;
933	if (cred) {
934		struct user_namespace *current_ns = current_user_ns();
935
936		ucred->uid = from_kuid_munged(current_ns, cred->euid);
937		ucred->gid = from_kgid_munged(current_ns, cred->egid);
938	}
939}
940EXPORT_SYMBOL_GPL(cred_to_ucred);
941
942int sock_getsockopt(struct socket *sock, int level, int optname,
943		    char __user *optval, int __user *optlen)
944{
945	struct sock *sk = sock->sk;
946
947	union {
948		int val;
949		struct linger ling;
950		struct timeval tm;
951	} v;
952
953	int lv = sizeof(int);
954	int len;
955
956	if (get_user(len, optlen))
957		return -EFAULT;
958	if (len < 0)
959		return -EINVAL;
960
961	memset(&v, 0, sizeof(v));
962
963	switch (optname) {
964	case SO_DEBUG:
965		v.val = sock_flag(sk, SOCK_DBG);
966		break;
967
968	case SO_DONTROUTE:
969		v.val = sock_flag(sk, SOCK_LOCALROUTE);
970		break;
971
972	case SO_BROADCAST:
973		v.val = sock_flag(sk, SOCK_BROADCAST);
974		break;
975
976	case SO_SNDBUF:
977		v.val = sk->sk_sndbuf;
978		break;
979
980	case SO_RCVBUF:
981		v.val = sk->sk_rcvbuf;
982		break;
983
984	case SO_REUSEADDR:
985		v.val = sk->sk_reuse;
986		break;
987
988	case SO_REUSEPORT:
989		v.val = sk->sk_reuseport;
990		break;
991
992	case SO_KEEPALIVE:
993		v.val = sock_flag(sk, SOCK_KEEPOPEN);
994		break;
995
996	case SO_TYPE:
997		v.val = sk->sk_type;
998		break;
999
1000	case SO_PROTOCOL:
1001		v.val = sk->sk_protocol;
1002		break;
1003
1004	case SO_DOMAIN:
1005		v.val = sk->sk_family;
1006		break;
1007
1008	case SO_ERROR:
1009		v.val = -sock_error(sk);
1010		if (v.val == 0)
1011			v.val = xchg(&sk->sk_err_soft, 0);
1012		break;
1013
1014	case SO_OOBINLINE:
1015		v.val = sock_flag(sk, SOCK_URGINLINE);
1016		break;
1017
1018	case SO_NO_CHECK:
1019		v.val = sk->sk_no_check;
1020		break;
1021
1022	case SO_PRIORITY:
1023		v.val = sk->sk_priority;
1024		break;
1025
1026	case SO_LINGER:
1027		lv		= sizeof(v.ling);
1028		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
1029		v.ling.l_linger	= sk->sk_lingertime / HZ;
1030		break;
1031
1032	case SO_BSDCOMPAT:
1033		sock_warn_obsolete_bsdism("getsockopt");
1034		break;
1035
1036	case SO_TIMESTAMP:
1037		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1038				!sock_flag(sk, SOCK_RCVTSTAMPNS);
1039		break;
1040
1041	case SO_TIMESTAMPNS:
1042		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1043		break;
1044
1045	case SO_TIMESTAMPING:
1046		v.val = 0;
1047		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
1048			v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
1049		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
1050			v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
1051		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
1052			v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
1053		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
1054			v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
1055		if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
1056			v.val |= SOF_TIMESTAMPING_SOFTWARE;
1057		if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
1058			v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
1059		if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
1060			v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
1061		break;
1062
1063	case SO_RCVTIMEO:
1064		lv = sizeof(struct timeval);
1065		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1066			v.tm.tv_sec = 0;
1067			v.tm.tv_usec = 0;
1068		} else {
1069			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1070			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
1071		}
1072		break;
1073
1074	case SO_SNDTIMEO:
1075		lv = sizeof(struct timeval);
1076		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1077			v.tm.tv_sec = 0;
1078			v.tm.tv_usec = 0;
1079		} else {
1080			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1081			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
1082		}
1083		break;
1084
1085	case SO_RCVLOWAT:
1086		v.val = sk->sk_rcvlowat;
1087		break;
1088
1089	case SO_SNDLOWAT:
1090		v.val = 1;
1091		break;
1092
1093	case SO_PASSCRED:
1094		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1095		break;
1096
1097	case SO_PEERCRED:
1098	{
1099		struct ucred peercred;
1100		if (len > sizeof(peercred))
1101			len = sizeof(peercred);
1102		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1103		if (copy_to_user(optval, &peercred, len))
1104			return -EFAULT;
1105		goto lenout;
1106	}
1107
1108	case SO_PEERNAME:
1109	{
1110		char address[128];
1111
1112		if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
1113			return -ENOTCONN;
1114		if (lv < len)
1115			return -EINVAL;
1116		if (copy_to_user(optval, address, len))
1117			return -EFAULT;
1118		goto lenout;
1119	}
1120
1121	/* Dubious BSD thing... Probably nobody even uses it, but
1122	 * the UNIX standard wants it for whatever reason... -DaveM
1123	 */
1124	case SO_ACCEPTCONN:
1125		v.val = sk->sk_state == TCP_LISTEN;
1126		break;
1127
1128	case SO_PASSSEC:
1129		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1130		break;
1131
1132	case SO_PEERSEC:
1133		return security_socket_getpeersec_stream(sock, optval, optlen, len);
1134
1135	case SO_MARK:
1136		v.val = sk->sk_mark;
1137		break;
1138
1139	case SO_RXQ_OVFL:
1140		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1141		break;
1142
1143	case SO_WIFI_STATUS:
1144		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1145		break;
1146
1147	case SO_PEEK_OFF:
1148		if (!sock->ops->set_peek_off)
1149			return -EOPNOTSUPP;
1150
1151		v.val = sk->sk_peek_off;
1152		break;
1153	case SO_NOFCS:
1154		v.val = sock_flag(sk, SOCK_NOFCS);
1155		break;
1156
1157	case SO_BINDTODEVICE:
1158		return sock_getbindtodevice(sk, optval, optlen, len);
1159
1160	case SO_GET_FILTER:
1161		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1162		if (len < 0)
1163			return len;
1164
1165		goto lenout;
1166
1167	case SO_LOCK_FILTER:
1168		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1169		break;
1170
1171	case SO_SELECT_ERR_QUEUE:
1172		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1173		break;
1174
1175#ifdef CONFIG_NET_RX_BUSY_POLL
1176	case SO_BUSY_POLL:
1177		v.val = sk->sk_ll_usec;
1178		break;
1179#endif
1180
1181	case SO_MAX_PACING_RATE:
1182		v.val = sk->sk_max_pacing_rate;
1183		break;
1184
1185	default:
1186		return -ENOPROTOOPT;
1187	}
1188
1189	if (len > lv)
1190		len = lv;
1191	if (copy_to_user(optval, &v, len))
1192		return -EFAULT;
1193lenout:
1194	if (put_user(len, optlen))
1195		return -EFAULT;
1196	return 0;
1197}
1198
1199/*
1200 * Initialize an sk_lock.
1201 *
1202 * (We also register the sk_lock with the lock validator.)
1203 */
1204static inline void sock_lock_init(struct sock *sk)
1205{
1206	sock_lock_init_class_and_name(sk,
1207			af_family_slock_key_strings[sk->sk_family],
1208			af_family_slock_keys + sk->sk_family,
1209			af_family_key_strings[sk->sk_family],
1210			af_family_keys + sk->sk_family);
1211}
1212
1213/*
1214 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1215 * even temporarily, because of RCU lookups. sk_node should also be left as is.
1216 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end.
1217 */
1218static void sock_copy(struct sock *nsk, const struct sock *osk)
1219{
1220#ifdef CONFIG_SECURITY_NETWORK
1221	void *sptr = nsk->sk_security;
1222#endif
1223	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1224
1225	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1226	       osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1227
1228#ifdef CONFIG_SECURITY_NETWORK
1229	nsk->sk_security = sptr;
1230	security_sk_clone(osk, nsk);
1231#endif
1232}
1233
1234void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
1235{
1236	unsigned long nulls1, nulls2;
1237
1238	nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
1239	nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
1240	if (nulls1 > nulls2)
1241		swap(nulls1, nulls2);
1242
1243	if (nulls1 != 0)
1244		memset((char *)sk, 0, nulls1);
1245	memset((char *)sk + nulls1 + sizeof(void *), 0,
1246	       nulls2 - nulls1 - sizeof(void *));
1247	memset((char *)sk + nulls2 + sizeof(void *), 0,
1248	       size - nulls2 - sizeof(void *));
1249}
1250EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
1251
1252static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1253		int family)
1254{
1255	struct sock *sk;
1256	struct kmem_cache *slab;
1257
1258	slab = prot->slab;
1259	if (slab != NULL) {
1260		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1261		if (!sk)
1262			return sk;
1263		if (priority & __GFP_ZERO) {
1264			if (prot->clear_sk)
1265				prot->clear_sk(sk, prot->obj_size);
1266			else
1267				sk_prot_clear_nulls(sk, prot->obj_size);
1268		}
1269	} else
1270		sk = kmalloc(prot->obj_size, priority);
1271
1272	if (sk != NULL) {
1273		kmemcheck_annotate_bitfield(sk, flags);
1274
1275		if (security_sk_alloc(sk, family, priority))
1276			goto out_free;
1277
1278		if (!try_module_get(prot->owner))
1279			goto out_free_sec;
1280		sk_tx_queue_clear(sk);
1281	}
1282
1283	return sk;
1284
1285out_free_sec:
1286	security_sk_free(sk);
1287out_free:
1288	if (slab != NULL)
1289		kmem_cache_free(slab, sk);
1290	else
1291		kfree(sk);
1292	return NULL;
1293}
1294
1295static void sk_prot_free(struct proto *prot, struct sock *sk)
1296{
1297	struct kmem_cache *slab;
1298	struct module *owner;
1299
1300	owner = prot->owner;
1301	slab = prot->slab;
1302
1303	security_sk_free(sk);
1304	if (slab != NULL)
1305		kmem_cache_free(slab, sk);
1306	else
1307		kfree(sk);
1308	module_put(owner);
1309}
1310
1311#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
1312void sock_update_netprioidx(struct sock *sk)
1313{
1314	if (in_interrupt())
1315		return;
1316
1317	sk->sk_cgrp_prioidx = task_netprioidx(current);
1318}
1319EXPORT_SYMBOL_GPL(sock_update_netprioidx);
1320#endif
1321
1322/**
1323 *	sk_alloc - All socket objects are allocated here
1324 *	@net: the applicable net namespace
1325 *	@family: protocol family
1326 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1327 *	@prot: struct proto associated with this new sock instance
1328 */
1329struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1330		      struct proto *prot)
1331{
1332	struct sock *sk;
1333
1334	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1335	if (sk) {
1336		sk->sk_family = family;
1337		/*
1338		 * See comment in struct sock definition to understand
1339		 * why we need sk_prot_creator -acme
1340		 */
1341		sk->sk_prot = sk->sk_prot_creator = prot;
1342		sock_lock_init(sk);
1343		sock_net_set(sk, get_net(net));
1344		atomic_set(&sk->sk_wmem_alloc, 1);
1345
1346		sock_update_classid(sk);
1347		sock_update_netprioidx(sk);
1348	}
1349
1350	return sk;
1351}
1352EXPORT_SYMBOL(sk_alloc);
1353
1354static void __sk_free(struct sock *sk)
1355{
1356	struct sk_filter *filter;
1357
1358	if (sk->sk_destruct)
1359		sk->sk_destruct(sk);
1360
1361	filter = rcu_dereference_check(sk->sk_filter,
1362				       atomic_read(&sk->sk_wmem_alloc) == 0);
1363	if (filter) {
1364		sk_filter_uncharge(sk, filter);
1365		RCU_INIT_POINTER(sk->sk_filter, NULL);
1366	}
1367
1368	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1369
1370	if (atomic_read(&sk->sk_omem_alloc))
1371		pr_debug("%s: optmem leakage (%d bytes) detected\n",
1372			 __func__, atomic_read(&sk->sk_omem_alloc));
1373
1374	if (sk->sk_peer_cred)
1375		put_cred(sk->sk_peer_cred);
1376	put_pid(sk->sk_peer_pid);
1377	put_net(sock_net(sk));
1378	sk_prot_free(sk->sk_prot_creator, sk);
1379}
1380
1381void sk_free(struct sock *sk)
1382{
1383	/*
1384	 * We subtract one from sk_wmem_alloc so we can tell whether
1385	 * some packets are still in some tx queue.
1386	 * If the result is not zero, sock_wfree() will call __sk_free(sk) later.
1387	 */
1388	if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1389		__sk_free(sk);
1390}
1391EXPORT_SYMBOL(sk_free);
1392
1393/*
1394 * The last sock_put should drop the reference to sk->sk_net. It has already
1395 * been dropped in sk_change_net. Taking a reference to a stopping namespace
1396 * is not an option.
1397 * Take a reference to the socket to remove it from the hash while still
1398 * _alive_, and after that destroy it in the context of init_net.
1399 */
1400void sk_release_kernel(struct sock *sk)
1401{
1402	if (sk == NULL || sk->sk_socket == NULL)
1403		return;
1404
1405	sock_hold(sk);
1406	sock_release(sk->sk_socket);
1407	release_net(sock_net(sk));
1408	sock_net_set(sk, get_net(&init_net));
1409	sock_put(sk);
1410}
1411EXPORT_SYMBOL(sk_release_kernel);
1412
1413static void sk_update_clone(const struct sock *sk, struct sock *newsk)
1414{
1415	if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
1416		sock_update_memcg(newsk);
1417}
1418
1419/**
1420 *	sk_clone_lock - clone a socket, and lock its clone
1421 *	@sk: the socket to clone
1422 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1423 *
1424 *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1425 */
1426struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1427{
1428	struct sock *newsk;
1429
1430	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1431	if (newsk != NULL) {
1432		struct sk_filter *filter;
1433
1434		sock_copy(newsk, sk);
1435
1436		/* SANITY */
1437		get_net(sock_net(newsk));
1438		sk_node_init(&newsk->sk_node);
1439		sock_lock_init(newsk);
1440		bh_lock_sock(newsk);
1441		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
1442		newsk->sk_backlog.len = 0;
1443
1444		atomic_set(&newsk->sk_rmem_alloc, 0);
1445		/*
1446		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1447		 */
1448		atomic_set(&newsk->sk_wmem_alloc, 1);
1449		atomic_set(&newsk->sk_omem_alloc, 0);
1450		skb_queue_head_init(&newsk->sk_receive_queue);
1451		skb_queue_head_init(&newsk->sk_write_queue);
1452#ifdef CONFIG_NET_DMA
1453		skb_queue_head_init(&newsk->sk_async_wait_queue);
1454#endif
1455
1456		spin_lock_init(&newsk->sk_dst_lock);
1457		rwlock_init(&newsk->sk_callback_lock);
1458		lockdep_set_class_and_name(&newsk->sk_callback_lock,
1459				af_callback_keys + newsk->sk_family,
1460				af_family_clock_key_strings[newsk->sk_family]);
1461
1462		newsk->sk_dst_cache	= NULL;
1463		newsk->sk_wmem_queued	= 0;
1464		newsk->sk_forward_alloc = 0;
1465		newsk->sk_send_head	= NULL;
1466		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1467
1468		sock_reset_flag(newsk, SOCK_DONE);
1469		skb_queue_head_init(&newsk->sk_error_queue);
1470
1471		filter = rcu_dereference_protected(newsk->sk_filter, 1);
1472		if (filter != NULL)
1473			sk_filter_charge(newsk, filter);
1474
1475		if (unlikely(xfrm_sk_clone_policy(newsk))) {
1476			/* It is still a raw copy of the parent, so invalidate
1477			 * the destructor and do a plain sk_free() */
1478			newsk->sk_destruct = NULL;
1479			bh_unlock_sock(newsk);
1480			sk_free(newsk);
1481			newsk = NULL;
1482			goto out;
1483		}
1484
1485		newsk->sk_err	   = 0;
1486		newsk->sk_priority = 0;
1487		/*
1488		 * Before updating sk_refcnt, we must commit prior changes to memory
1489		 * (Documentation/RCU/rculist_nulls.txt for details)
1490		 */
1491		smp_wmb();
1492		atomic_set(&newsk->sk_refcnt, 2);
1493
1494		/*
1495		 * Increment the counter in the same struct proto as the master
1496		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1497		 * is the same as sk->sk_prot->socks, as this field was copied
1498		 * with memcpy).
1499		 *
1500		 * This _changes_ the previous behaviour, where
1501		 * tcp_create_openreq_child always was incrementing the
1502		 * equivalent to tcp_prot->socks (inet_sock_nr), so this has
1503		 * to be taken into account in all callers. -acme
1504		 */
1505		sk_refcnt_debug_inc(newsk);
1506		sk_set_socket(newsk, NULL);
1507		newsk->sk_wq = NULL;
1508
1509		sk_update_clone(sk, newsk);
1510
1511		if (newsk->sk_prot->sockets_allocated)
1512			sk_sockets_allocated_inc(newsk);
1513
1514		if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1515			net_enable_timestamp();
1516	}
1517out:
1518	return newsk;
1519}
1520EXPORT_SYMBOL_GPL(sk_clone_lock);
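
/*
 * Illustrative caller pattern (not from this file): the clone comes
 * back locked with bh_lock_sock(), so the caller must bh_unlock_sock()
 * it on every path, including its own error paths:
 *
 *	newsk = sk_clone_lock(sk, GFP_ATOMIC);
 *	if (newsk) {
 *		... protocol-specific initialisation ...
 *		bh_unlock_sock(newsk);
 *	}
 */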
1521
1522void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1523{
1524	__sk_dst_set(sk, dst);
1525	sk->sk_route_caps = dst->dev->features;
1526	if (sk->sk_route_caps & NETIF_F_GSO)
1527		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1528	sk->sk_route_caps &= ~sk->sk_route_nocaps;
1529	if (sk_can_gso(sk)) {
1530		if (dst->header_len) {
1531			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1532		} else {
1533			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1534			sk->sk_gso_max_size = dst->dev->gso_max_size;
1535			sk->sk_gso_max_segs = dst->dev->gso_max_segs;
1536		}
1537	}
1538}
1539EXPORT_SYMBOL_GPL(sk_setup_caps);
1540
1541/*
1542 *	Simple resource managers for sockets.
1543 */
1544
1545
1546/*
1547 * Write buffer destructor automatically called from kfree_skb.
1548 */
1549void sock_wfree(struct sk_buff *skb)
1550{
1551	struct sock *sk = skb->sk;
1552	unsigned int len = skb->truesize;
1553
1554	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1555		/*
1556		 * Keep a reference on sk_wmem_alloc; it will be released
1557		 * after the sk_write_space() call
1558		 */
1559		atomic_sub(len - 1, &sk->sk_wmem_alloc);
1560		sk->sk_write_space(sk);
1561		len = 1;
1562	}
1563	/*
1564	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1565	 * could not do because of in-flight packets
1566	 */
1567	if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1568		__sk_free(sk);
1569}
1570EXPORT_SYMBOL(sock_wfree);
1571
1572void skb_orphan_partial(struct sk_buff *skb)
1573{
1574	/* TCP stack sets skb->ooo_okay based on sk_wmem_alloc,
1575	 * so we do not completely orphan the skb, but transfer all
1576	 * accounted bytes but one, to avoid unexpected reorders.
1577	 */
1578	if (skb->destructor == sock_wfree
1579#ifdef CONFIG_INET
1580	    || skb->destructor == tcp_wfree
1581#endif
1582		) {
1583		atomic_sub(skb->truesize - 1, &skb->sk->sk_wmem_alloc);
1584		skb->truesize = 1;
1585	} else {
1586		skb_orphan(skb);
1587	}
1588}
1589EXPORT_SYMBOL(skb_orphan_partial);
1590
1591/*
1592 * Read buffer destructor automatically called from kfree_skb.
1593 */
1594void sock_rfree(struct sk_buff *skb)
1595{
1596	struct sock *sk = skb->sk;
1597	unsigned int len = skb->truesize;
1598
1599	atomic_sub(len, &sk->sk_rmem_alloc);
1600	sk_mem_uncharge(sk, len);
1601}
1602EXPORT_SYMBOL(sock_rfree);
1603
1604void sock_edemux(struct sk_buff *skb)
1605{
1606	struct sock *sk = skb->sk;
1607
1608#ifdef CONFIG_INET
1609	if (sk->sk_state == TCP_TIME_WAIT)
1610		inet_twsk_put(inet_twsk(sk));
1611	else
1612#endif
1613		sock_put(sk);
1614}
1615EXPORT_SYMBOL(sock_edemux);
1616
1617kuid_t sock_i_uid(struct sock *sk)
1618{
1619	kuid_t uid;
1620
1621	read_lock_bh(&sk->sk_callback_lock);
1622	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1623	read_unlock_bh(&sk->sk_callback_lock);
1624	return uid;
1625}
1626EXPORT_SYMBOL(sock_i_uid);
1627
1628unsigned long sock_i_ino(struct sock *sk)
1629{
1630	unsigned long ino;
1631
1632	read_lock_bh(&sk->sk_callback_lock);
1633	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1634	read_unlock_bh(&sk->sk_callback_lock);
1635	return ino;
1636}
1637EXPORT_SYMBOL(sock_i_ino);
1638
1639/*
1640 * Allocate a skb from the socket's send buffer.
1641 */
1642struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1643			     gfp_t priority)
1644{
1645	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1646		struct sk_buff *skb = alloc_skb(size, priority);
1647		if (skb) {
1648			skb_set_owner_w(skb, sk);
1649			return skb;
1650		}
1651	}
1652	return NULL;
1653}
1654EXPORT_SYMBOL(sock_wmalloc);
1655
1656/*
1657 * Allocate a skb from the socket's receive buffer.
1658 */
1659struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
1660			     gfp_t priority)
1661{
1662	if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1663		struct sk_buff *skb = alloc_skb(size, priority);
1664		if (skb) {
1665			skb_set_owner_r(skb, sk);
1666			return skb;
1667		}
1668	}
1669	return NULL;
1670}
1671
1672/*
1673 * Allocate a memory block from the socket's option memory buffer.
1674 */
1675void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1676{
1677	if ((unsigned int)size <= sysctl_optmem_max &&
1678	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1679		void *mem;
1680		/* First do the add, to avoid the race if kmalloc
1681		 * might sleep.
1682		 */
1683		atomic_add(size, &sk->sk_omem_alloc);
1684		mem = kmalloc(size, priority);
1685		if (mem)
1686			return mem;
1687		atomic_sub(size, &sk->sk_omem_alloc);
1688	}
1689	return NULL;
1690}
1691EXPORT_SYMBOL(sock_kmalloc);
1692
1693/*
1694 * Free an option memory block.
1695 */
1696void sock_kfree_s(struct sock *sk, void *mem, int size)
1697{
1698	kfree(mem);
1699	atomic_sub(size, &sk->sk_omem_alloc);
1700}
1701EXPORT_SYMBOL(sock_kfree_s);
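
/*
 * Typical pairing (illustrative): option memory charged to the socket
 * must be released with the same size it was charged with, e.g. in a
 * protocol's setsockopt() handler:
 *
 *	struct my_opt *opt = sock_kmalloc(sk, sizeof(*opt), GFP_KERNEL);
 *
 *	if (!opt)
 *		return -ENOBUFS;
 *	...
 *	sock_kfree_s(sk, opt, sizeof(*opt));
 *
 * "struct my_opt" is a placeholder type; the IPv6 option code is one
 * real user of this pattern.
 */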
1702
1703/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1704   I think these locks should be removed for datagram sockets.
1705 */
1706static long sock_wait_for_wmem(struct sock *sk, long timeo)
1707{
1708	DEFINE_WAIT(wait);
1709
1710	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1711	for (;;) {
1712		if (!timeo)
1713			break;
1714		if (signal_pending(current))
1715			break;
1716		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1717		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1718		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1719			break;
1720		if (sk->sk_shutdown & SEND_SHUTDOWN)
1721			break;
1722		if (sk->sk_err)
1723			break;
1724		timeo = schedule_timeout(timeo);
1725	}
1726	finish_wait(sk_sleep(sk), &wait);
1727	return timeo;
1728}
1729
1730
1731/*
1732 *	Generic send/receive buffer handlers
1733 */
1734
1735struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1736				     unsigned long data_len, int noblock,
1737				     int *errcode, int max_page_order)
1738{
1739	struct sk_buff *skb = NULL;
1740	unsigned long chunk;
1741	gfp_t gfp_mask;
1742	long timeo;
1743	int err;
1744	int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1745	struct page *page;
1746	int i;
1747
1748	err = -EMSGSIZE;
1749	if (npages > MAX_SKB_FRAGS)
1750		goto failure;
1751
1752	timeo = sock_sndtimeo(sk, noblock);
1753	while (!skb) {
1754		err = sock_error(sk);
1755		if (err != 0)
1756			goto failure;
1757
1758		err = -EPIPE;
1759		if (sk->sk_shutdown & SEND_SHUTDOWN)
1760			goto failure;
1761
1762		if (atomic_read(&sk->sk_wmem_alloc) >= sk->sk_sndbuf) {
1763			set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1764			set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1765			err = -EAGAIN;
1766			if (!timeo)
1767				goto failure;
1768			if (signal_pending(current))
1769				goto interrupted;
1770			timeo = sock_wait_for_wmem(sk, timeo);
1771			continue;
1772		}
1773
1774		err = -ENOBUFS;
1775		gfp_mask = sk->sk_allocation;
1776		if (gfp_mask & __GFP_WAIT)
1777			gfp_mask |= __GFP_REPEAT;
1778
1779		skb = alloc_skb(header_len, gfp_mask);
1780		if (!skb)
1781			goto failure;
1782
1783		skb->truesize += data_len;
1784
1785		for (i = 0; npages > 0; i++) {
1786			int order = max_page_order;
1787
1788			while (order) {
1789				if (npages >= 1 << order) {
1790					page = alloc_pages(sk->sk_allocation |
1791							   __GFP_COMP | __GFP_NOWARN,
1792							   order);
1793					if (page)
1794						goto fill_page;
1795				}
1796				order--;
1797			}
1798			page = alloc_page(sk->sk_allocation);
1799			if (!page)
1800				goto failure;
1801fill_page:
1802			chunk = min_t(unsigned long, data_len,
1803				      PAGE_SIZE << order);
1804			skb_fill_page_desc(skb, i, page, 0, chunk);
1805			data_len -= chunk;
1806			npages -= 1 << order;
1807		}
1808	}
1809
1810	skb_set_owner_w(skb, sk);
1811	return skb;
1812
1813interrupted:
1814	err = sock_intr_errno(timeo);
1815failure:
1816	kfree_skb(skb);
1817	*errcode = err;
1818	return NULL;
1819}
1820EXPORT_SYMBOL(sock_alloc_send_pskb);
1821
1822struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1823				    int noblock, int *errcode)
1824{
1825	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
1826}
1827EXPORT_SYMBOL(sock_alloc_send_skb);
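
/*
 * Illustrative sendmsg-side use (not from this file): a simple datagram
 * protocol reserves headroom, copies the payload and then hands the skb
 * to its transmit routine.  Error handling is abbreviated.
 *
 *	skb = sock_alloc_send_skb(sk, hlen + len,
 *				  msg->msg_flags & MSG_DONTWAIT, &err);
 *	if (!skb)
 *		return err;
 *	skb_reserve(skb, hlen);
 *	err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
 *	if (err < 0) {
 *		kfree_skb(skb);
 *		return err;
 *	}
 */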
1828
1829/* On 32bit arches, an skb frag is limited to 2^15 */
1830#define SKB_FRAG_PAGE_ORDER	get_order(32768)
1831
1832/**
1833 * skb_page_frag_refill - check that a page_frag contains enough room
1834 * @sz: minimum size of the fragment we want to get
1835 * @pfrag: pointer to page_frag
1836 * @prio: priority for memory allocation
1837 *
1838 * Note: While this allocator tries to use high order pages, there is
1839 * no guarantee that allocations succeed. Therefore, @sz MUST be
1840 * less than or equal to PAGE_SIZE.
1841 */
1842bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t prio)
1843{
1844	int order;
1845
1846	if (pfrag->page) {
1847		if (atomic_read(&pfrag->page->_count) == 1) {
1848			pfrag->offset = 0;
1849			return true;
1850		}
1851		if (pfrag->offset + sz <= pfrag->size)
1852			return true;
1853		put_page(pfrag->page);
1854	}
1855
1856	/* We restrict high order allocations to users that can afford to wait */
1857	order = (prio & __GFP_WAIT) ? SKB_FRAG_PAGE_ORDER : 0;
1858
1859	do {
1860		gfp_t gfp = prio;
1861
1862		if (order)
1863			gfp |= __GFP_COMP | __GFP_NOWARN;
1864		pfrag->page = alloc_pages(gfp, order);
1865		if (likely(pfrag->page)) {
1866			pfrag->offset = 0;
1867			pfrag->size = PAGE_SIZE << order;
1868			return true;
1869		}
1870	} while (--order >= 0);
1871
1872	return false;
1873}
1874EXPORT_SYMBOL(skb_page_frag_refill);
1875
1876bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
1877{
1878	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
1879		return true;
1880
1881	sk_enter_memory_pressure(sk);
1882	sk_stream_moderate_sndbuf(sk);
1883	return false;
1884}
1885EXPORT_SYMBOL(sk_page_frag_refill);
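
/*
 * Usage sketch (illustrative): stream protocols refill the per-socket
 * (or per-task) page fragment and attach it to the skb under
 * construction, roughly as follows.  This mirrors what tcp_sendmsg()
 * does; it is a simplification, not a copy of that code.
 *
 *	struct page_frag *pfrag = sk_page_frag(sk);
 *
 *	if (!sk_page_frag_refill(sk, pfrag))
 *		goto wait_for_memory;
 *	copy = min_t(int, copy, pfrag->size - pfrag->offset);
 *	... copy user data to page_address(pfrag->page) + pfrag->offset ...
 *	skb_fill_page_desc(skb, i, pfrag->page, pfrag->offset, copy);
 *	get_page(pfrag->page);
 *	pfrag->offset += copy;
 */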
1886
1887static void __lock_sock(struct sock *sk)
1888	__releases(&sk->sk_lock.slock)
1889	__acquires(&sk->sk_lock.slock)
1890{
1891	DEFINE_WAIT(wait);
1892
1893	for (;;) {
1894		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1895					TASK_UNINTERRUPTIBLE);
1896		spin_unlock_bh(&sk->sk_lock.slock);
1897		schedule();
1898		spin_lock_bh(&sk->sk_lock.slock);
1899		if (!sock_owned_by_user(sk))
1900			break;
1901	}
1902	finish_wait(&sk->sk_lock.wq, &wait);
1903}
1904
1905static void __release_sock(struct sock *sk)
1906	__releases(&sk->sk_lock.slock)
1907	__acquires(&sk->sk_lock.slock)
1908{
1909	struct sk_buff *skb = sk->sk_backlog.head;
1910
1911	do {
1912		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1913		bh_unlock_sock(sk);
1914
1915		do {
1916			struct sk_buff *next = skb->next;
1917
1918			prefetch(next);
1919			WARN_ON_ONCE(skb_dst_is_noref(skb));
1920			skb->next = NULL;
1921			sk_backlog_rcv(sk, skb);
1922
1923			/*
1924			 * We are in process context here with softirqs
1925			 * disabled, use cond_resched_softirq() to preempt.
1926			 * This is safe to do because we've taken the backlog
1927			 * queue private:
1928			 */
1929			cond_resched_softirq();
1930
1931			skb = next;
1932		} while (skb != NULL);
1933
1934		bh_lock_sock(sk);
1935	} while ((skb = sk->sk_backlog.head) != NULL);
1936
1937	/*
1938	 * Doing the zeroing here guarantees we cannot loop forever
1939	 * while a wild producer attempts to flood us.
1940	 */
1941	sk->sk_backlog.len = 0;
1942}
1943
1944/**
1945 * sk_wait_data - wait for data to arrive at sk_receive_queue
1946 * @sk:    sock to wait on
1947 * @timeo: for how long
1948 *
1949 * Now socket state including sk->sk_err is changed only under the lock,
1950 * hence we may omit checks after joining the wait queue.
1951 * We check the receive queue before schedule() only as an optimization;
1952 * it is very likely that release_sock() added new data.
1953 */
1954int sk_wait_data(struct sock *sk, long *timeo)
1955{
1956	int rc;
1957	DEFINE_WAIT(wait);
1958
1959	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1960	set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1961	rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1962	clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1963	finish_wait(sk_sleep(sk), &wait);
1964	return rc;
1965}
1966EXPORT_SYMBOL(sk_wait_data);
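
/*
 * Illustrative caller (not from this file): a blocking recvmsg()
 * implementation typically loops like this, with the socket lock held:
 *
 *	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 *	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
 *		if (sk->sk_err || (sk->sk_shutdown & RCV_SHUTDOWN))
 *			break;
 *		if (!timeo)
 *			return -EAGAIN;
 *		if (signal_pending(current))
 *			return sock_intr_errno(timeo);
 *		sk_wait_data(sk, &timeo);
 *	}
 */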
1967
1968/**
1969 *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
1970 *	@sk: socket
1971 *	@size: memory size to allocate
1972 *	@kind: allocation type
1973 *
1974 *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
1975 *	rmem allocation. This function assumes that protocols which have
1976 *	memory_pressure use sk_wmem_queued as write buffer accounting.
1977 */
1978int __sk_mem_schedule(struct sock *sk, int size, int kind)
1979{
1980	struct proto *prot = sk->sk_prot;
1981	int amt = sk_mem_pages(size);
1982	long allocated;
1983	int parent_status = UNDER_LIMIT;
1984
1985	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
1986
1987	allocated = sk_memory_allocated_add(sk, amt, &parent_status);
1988
1989	/* Under limit. */
1990	if (parent_status == UNDER_LIMIT &&
1991			allocated <= sk_prot_mem_limits(sk, 0)) {
1992		sk_leave_memory_pressure(sk);
1993		return 1;
1994	}
1995
1996	/* Under pressure. (we or our parents) */
1997	if ((parent_status > SOFT_LIMIT) ||
1998			allocated > sk_prot_mem_limits(sk, 1))
1999		sk_enter_memory_pressure(sk);
2000
2001	/* Over hard limit (we or our parents) */
2002	if ((parent_status == OVER_LIMIT) ||
2003			(allocated > sk_prot_mem_limits(sk, 2)))
2004		goto suppress_allocation;
2005
2006	/* guarantee minimum buffer size under pressure */
2007	if (kind == SK_MEM_RECV) {
2008		if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
2009			return 1;
2010
2011	} else { /* SK_MEM_SEND */
2012		if (sk->sk_type == SOCK_STREAM) {
2013			if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
2014				return 1;
2015		} else if (atomic_read(&sk->sk_wmem_alloc) <
2016			   prot->sysctl_wmem[0])
2017				return 1;
2018	}
2019
2020	if (sk_has_memory_pressure(sk)) {
2021		int alloc;
2022
2023		if (!sk_under_memory_pressure(sk))
2024			return 1;
2025		alloc = sk_sockets_allocated_read_positive(sk);
2026		if (sk_prot_mem_limits(sk, 2) > alloc *
2027		    sk_mem_pages(sk->sk_wmem_queued +
2028				 atomic_read(&sk->sk_rmem_alloc) +
2029				 sk->sk_forward_alloc))
2030			return 1;
2031	}
2032
2033suppress_allocation:
2034
2035	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2036		sk_stream_moderate_sndbuf(sk);
2037
		/* Fail only if the socket is _under_ its sndbuf.
		 * In that case we cannot block, so we have to fail.
2040		 */
2041		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2042			return 1;
2043	}
2044
2045	trace_sock_exceed_buf_limit(sk, prot, allocated);
2046
2047	/* Alas. Undo changes. */
2048	sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
2049
2050	sk_memory_allocated_sub(sk, amt);
2051
2052	return 0;
2053}
2054EXPORT_SYMBOL(__sk_mem_schedule);
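
/*
 * Illustrative sketch (not part of this file): protocols normally do not
 * call __sk_mem_schedule() directly but go through the inline wrappers in
 * include/net/sock.h, which only take this slow path once the per-socket
 * forward allocation is exhausted, roughly:
 *
 *	if (!sk_wmem_schedule(sk, skb->truesize))	// SK_MEM_SEND
 *		goto drop_or_wait_for_memory;
 *	if (!sk_rmem_schedule(sk, skb, skb->truesize))	// SK_MEM_RECV
 *		goto drop;
 */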
2055
2056/**
2057 *	__sk_reclaim - reclaim memory_allocated
2058 *	@sk: socket
2059 */
2060void __sk_mem_reclaim(struct sock *sk)
2061{
2062	sk_memory_allocated_sub(sk,
2063				sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT);
2064	sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
2065
2066	if (sk_under_memory_pressure(sk) &&
2067	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2068		sk_leave_memory_pressure(sk);
2069}
2070EXPORT_SYMBOL(__sk_mem_reclaim);
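
/*
 * Illustrative sketch (not part of this file): as with __sk_mem_schedule(),
 * callers normally use the sk_mem_reclaim() / sk_mem_reclaim_partial()
 * inline wrappers, which only call into this slow path when at least one
 * full SK_MEM_QUANTUM of forward allocation can be returned, roughly:
 *
 *	if (sk->sk_forward_alloc >= SK_MEM_QUANTUM)
 *		__sk_mem_reclaim(sk);
 */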
2071
2072
2073/*
2074 * Set of default routines for initialising struct proto_ops when
2075 * the protocol does not support a particular function. In certain
2076 * cases where it makes no sense for a protocol to have a "do nothing"
2077 * function, some default processing is provided.
2078 */
2079
2080int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2081{
2082	return -EOPNOTSUPP;
2083}
2084EXPORT_SYMBOL(sock_no_bind);
2085
2086int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2087		    int len, int flags)
2088{
2089	return -EOPNOTSUPP;
2090}
2091EXPORT_SYMBOL(sock_no_connect);
2092
2093int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2094{
2095	return -EOPNOTSUPP;
2096}
2097EXPORT_SYMBOL(sock_no_socketpair);
2098
2099int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
2100{
2101	return -EOPNOTSUPP;
2102}
2103EXPORT_SYMBOL(sock_no_accept);
2104
2105int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2106		    int *len, int peer)
2107{
2108	return -EOPNOTSUPP;
2109}
2110EXPORT_SYMBOL(sock_no_getname);
2111
2112unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
2113{
2114	return 0;
2115}
2116EXPORT_SYMBOL(sock_no_poll);
2117
2118int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2119{
2120	return -EOPNOTSUPP;
2121}
2122EXPORT_SYMBOL(sock_no_ioctl);
2123
2124int sock_no_listen(struct socket *sock, int backlog)
2125{
2126	return -EOPNOTSUPP;
2127}
2128EXPORT_SYMBOL(sock_no_listen);
2129
2130int sock_no_shutdown(struct socket *sock, int how)
2131{
2132	return -EOPNOTSUPP;
2133}
2134EXPORT_SYMBOL(sock_no_shutdown);
2135
2136int sock_no_setsockopt(struct socket *sock, int level, int optname,
2137		    char __user *optval, unsigned int optlen)
2138{
2139	return -EOPNOTSUPP;
2140}
2141EXPORT_SYMBOL(sock_no_setsockopt);
2142
2143int sock_no_getsockopt(struct socket *sock, int level, int optname,
2144		    char __user *optval, int __user *optlen)
2145{
2146	return -EOPNOTSUPP;
2147}
2148EXPORT_SYMBOL(sock_no_getsockopt);
2149
2150int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
2151		    size_t len)
2152{
2153	return -EOPNOTSUPP;
2154}
2155EXPORT_SYMBOL(sock_no_sendmsg);
2156
2157int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
2158		    size_t len, int flags)
2159{
2160	return -EOPNOTSUPP;
2161}
2162EXPORT_SYMBOL(sock_no_recvmsg);
2163
2164int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2165{
2166	/* Mirror missing mmap method error code */
2167	return -ENODEV;
2168}
2169EXPORT_SYMBOL(sock_no_mmap);
2170
2171ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2172{
2173	ssize_t res;
2174	struct msghdr msg = {.msg_flags = flags};
2175	struct kvec iov;
2176	char *kaddr = kmap(page);
2177	iov.iov_base = kaddr + offset;
2178	iov.iov_len = size;
2179	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2180	kunmap(page);
2181	return res;
2182}
2183EXPORT_SYMBOL(sock_no_sendpage);
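
/*
 * Illustrative sketch (not part of this file): an address family that does
 * not implement a given operation simply points its proto_ops at the
 * sock_no_*() stubs above. The struct name and the implemented handlers
 * below are hypothetical:
 *
 *	static const struct proto_ops example_dgram_ops = {
 *		.family		= PF_EXAMPLE,
 *		.owner		= THIS_MODULE,
 *		.release	= example_release,
 *		.bind		= example_bind,
 *		.connect	= sock_no_connect,
 *		.socketpair	= sock_no_socketpair,
 *		.accept		= sock_no_accept,
 *		.listen		= sock_no_listen,
 *		.shutdown	= sock_no_shutdown,
 *		.mmap		= sock_no_mmap,
 *		.sendpage	= sock_no_sendpage,
 *		...
 *	};
 */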
2184
2185/*
2186 *	Default Socket Callbacks
2187 */
2188
2189static void sock_def_wakeup(struct sock *sk)
2190{
2191	struct socket_wq *wq;
2192
2193	rcu_read_lock();
2194	wq = rcu_dereference(sk->sk_wq);
2195	if (wq_has_sleeper(wq))
2196		wake_up_interruptible_all(&wq->wait);
2197	rcu_read_unlock();
2198}
2199
2200static void sock_def_error_report(struct sock *sk)
2201{
2202	struct socket_wq *wq;
2203
2204	rcu_read_lock();
2205	wq = rcu_dereference(sk->sk_wq);
2206	if (wq_has_sleeper(wq))
2207		wake_up_interruptible_poll(&wq->wait, POLLERR);
2208	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2209	rcu_read_unlock();
2210}
2211
2212static void sock_def_readable(struct sock *sk, int len)
2213{
2214	struct socket_wq *wq;
2215
2216	rcu_read_lock();
2217	wq = rcu_dereference(sk->sk_wq);
2218	if (wq_has_sleeper(wq))
2219		wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
2220						POLLRDNORM | POLLRDBAND);
2221	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2222	rcu_read_unlock();
2223}
2224
2225static void sock_def_write_space(struct sock *sk)
2226{
2227	struct socket_wq *wq;
2228
2229	rcu_read_lock();
2230
2231	/* Do not wake up a writer until he can make "significant"
2232	 * progress.  --DaveM
2233	 */
2234	if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2235		wq = rcu_dereference(sk->sk_wq);
2236		if (wq_has_sleeper(wq))
2237			wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
2238						POLLWRNORM | POLLWRBAND);
2239
2240		/* Should agree with poll, otherwise some programs break */
2241		if (sock_writeable(sk))
2242			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2243	}
2244
2245	rcu_read_unlock();
2246}
2247
2248static void sock_def_destruct(struct sock *sk)
2249{
2250	kfree(sk->sk_protinfo);
2251}
2252
2253void sk_send_sigurg(struct sock *sk)
2254{
2255	if (sk->sk_socket && sk->sk_socket->file)
2256		if (send_sigurg(&sk->sk_socket->file->f_owner))
2257			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2258}
2259EXPORT_SYMBOL(sk_send_sigurg);
2260
2261void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2262		    unsigned long expires)
2263{
2264	if (!mod_timer(timer, expires))
2265		sock_hold(sk);
2266}
2267EXPORT_SYMBOL(sk_reset_timer);
2268
2269void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2270{
2271	if (del_timer(timer))
2272		__sock_put(sk);
2273}
2274EXPORT_SYMBOL(sk_stop_timer);
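
/*
 * Illustrative sketch (not part of this file): sk_reset_timer() and
 * sk_stop_timer() keep a socket reference for as long as the timer is
 * pending, so a protocol typically pairs them like this (the timeout
 * below is a placeholder):
 *
 *	sk_reset_timer(sk, &sk->sk_timer, jiffies + some_timeout);
 *	...
 *	sk_stop_timer(sk, &sk->sk_timer);
 *
 * By convention the timer handler drops the reference with sock_put()
 * when it fires without re-arming, since del_timer() on an expired timer
 * returns 0 and sk_stop_timer() will not drop it.
 */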
2275
2276void sock_init_data(struct socket *sock, struct sock *sk)
2277{
2278	skb_queue_head_init(&sk->sk_receive_queue);
2279	skb_queue_head_init(&sk->sk_write_queue);
2280	skb_queue_head_init(&sk->sk_error_queue);
2281#ifdef CONFIG_NET_DMA
2282	skb_queue_head_init(&sk->sk_async_wait_queue);
2283#endif
2284
2285	sk->sk_send_head	=	NULL;
2286
2287	init_timer(&sk->sk_timer);
2288
2289	sk->sk_allocation	=	GFP_KERNEL;
2290	sk->sk_rcvbuf		=	sysctl_rmem_default;
2291	sk->sk_sndbuf		=	sysctl_wmem_default;
2292	sk->sk_state		=	TCP_CLOSE;
2293	sk_set_socket(sk, sock);
2294
2295	sock_set_flag(sk, SOCK_ZAPPED);
2296
2297	if (sock) {
2298		sk->sk_type	=	sock->type;
2299		sk->sk_wq	=	sock->wq;
2300		sock->sk	=	sk;
2301	} else
2302		sk->sk_wq	=	NULL;
2303
2304	spin_lock_init(&sk->sk_dst_lock);
2305	rwlock_init(&sk->sk_callback_lock);
2306	lockdep_set_class_and_name(&sk->sk_callback_lock,
2307			af_callback_keys + sk->sk_family,
2308			af_family_clock_key_strings[sk->sk_family]);
2309
2310	sk->sk_state_change	=	sock_def_wakeup;
2311	sk->sk_data_ready	=	sock_def_readable;
2312	sk->sk_write_space	=	sock_def_write_space;
2313	sk->sk_error_report	=	sock_def_error_report;
2314	sk->sk_destruct		=	sock_def_destruct;
2315
2316	sk->sk_frag.page	=	NULL;
2317	sk->sk_frag.offset	=	0;
2318	sk->sk_peek_off		=	-1;
2319
2320	sk->sk_peer_pid 	=	NULL;
2321	sk->sk_peer_cred	=	NULL;
2322	sk->sk_write_pending	=	0;
2323	sk->sk_rcvlowat		=	1;
2324	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
2325	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
2326
2327	sk->sk_stamp = ktime_set(-1L, 0);
2328
2329#ifdef CONFIG_NET_RX_BUSY_POLL
2330	sk->sk_napi_id		=	0;
2331	sk->sk_ll_usec		=	sysctl_net_busy_read;
2332#endif
2333
2334	sk->sk_max_pacing_rate = ~0U;
2335	sk->sk_pacing_rate = ~0U;
2336	/*
2337	 * Before updating sk_refcnt, we must commit prior changes to memory
2338	 * (Documentation/RCU/rculist_nulls.txt for details)
2339	 */
2340	smp_wmb();
2341	atomic_set(&sk->sk_refcnt, 1);
2342	atomic_set(&sk->sk_drops, 0);
2343}
2344EXPORT_SYMBOL(sock_init_data);
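
/*
 * Illustrative sketch (not part of this file): an address family's
 * ->create() handler typically allocates the sock and then hands it to
 * sock_init_data() to wire up the defaults above; PF_EXAMPLE and
 * example_proto are placeholders:
 *
 *	sk = sk_alloc(net, PF_EXAMPLE, GFP_KERNEL, &example_proto);
 *	if (!sk)
 *		return -ENOMEM;
 *	sock_init_data(sock, sk);
 *	// ...then override sk_destruct, sk_data_ready, etc. as needed...
 */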
2345
2346void lock_sock_nested(struct sock *sk, int subclass)
2347{
2348	might_sleep();
2349	spin_lock_bh(&sk->sk_lock.slock);
2350	if (sk->sk_lock.owned)
2351		__lock_sock(sk);
2352	sk->sk_lock.owned = 1;
2353	spin_unlock(&sk->sk_lock.slock);
2354	/*
2355	 * The sk_lock has mutex_lock() semantics here:
2356	 */
2357	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2358	local_bh_enable();
2359}
2360EXPORT_SYMBOL(lock_sock_nested);
2361
2362void release_sock(struct sock *sk)
2363{
2364	/*
2365	 * The sk_lock has mutex_unlock() semantics:
2366	 */
2367	mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
2368
2369	spin_lock_bh(&sk->sk_lock.slock);
2370	if (sk->sk_backlog.tail)
2371		__release_sock(sk);
2372
2373	if (sk->sk_prot->release_cb)
2374		sk->sk_prot->release_cb(sk);
2375
2376	sk->sk_lock.owned = 0;
2377	if (waitqueue_active(&sk->sk_lock.wq))
2378		wake_up(&sk->sk_lock.wq);
2379	spin_unlock_bh(&sk->sk_lock.slock);
2380}
2381EXPORT_SYMBOL(release_sock);
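
/*
 * Illustrative sketch (not part of this file): process-context users take
 * the socket lock with lock_sock() (which is lock_sock_nested(sk, 0)) and
 * drop it with release_sock(), which also flushes any backlog that softirq
 * context queued while the lock was owned:
 *
 *	lock_sock(sk);
 *	// ...modify socket state, walk queues, etc...
 *	release_sock(sk);
 */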
2382
2383/**
2384 * lock_sock_fast - fast version of lock_sock
2385 * @sk: socket
2386 *
 * This version should be used for very small sections, where the process
 * won't block.
 * Returns false if the fast path is taken:
 *   sk_lock.slock locked, owned = 0, BH disabled
 * Returns true if the slow path is taken:
 *   sk_lock.slock unlocked, owned = 1, BH enabled
2392 */
2393bool lock_sock_fast(struct sock *sk)
2394{
2395	might_sleep();
2396	spin_lock_bh(&sk->sk_lock.slock);
2397
2398	if (!sk->sk_lock.owned)
2399		/*
		 * Note: we keep BH disabled; unlock_sock_fast() will
		 * drop the spinlock and re-enable BH.
2401		 */
2402		return false;
2403
2404	__lock_sock(sk);
2405	sk->sk_lock.owned = 1;
2406	spin_unlock(&sk->sk_lock.slock);
2407	/*
2408	 * The sk_lock has mutex_lock() semantics here:
2409	 */
2410	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2411	local_bh_enable();
2412	return true;
2413}
2414EXPORT_SYMBOL(lock_sock_fast);
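
/*
 * Illustrative sketch (not part of this file): lock_sock_fast() is paired
 * with unlock_sock_fast(), which undoes whichever path was taken:
 *
 *	bool slow = lock_sock_fast(sk);
 *
 *	// ...short critical section that will not block...
 *
 *	unlock_sock_fast(sk, slow);
 */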
2415
2416int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2417{
2418	struct timeval tv;
2419	if (!sock_flag(sk, SOCK_TIMESTAMP))
2420		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2421	tv = ktime_to_timeval(sk->sk_stamp);
2422	if (tv.tv_sec == -1)
2423		return -ENOENT;
2424	if (tv.tv_sec == 0) {
2425		sk->sk_stamp = ktime_get_real();
2426		tv = ktime_to_timeval(sk->sk_stamp);
2427	}
2428	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2429}
2430EXPORT_SYMBOL(sock_get_timestamp);
2431
2432int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2433{
2434	struct timespec ts;
2435	if (!sock_flag(sk, SOCK_TIMESTAMP))
2436		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2437	ts = ktime_to_timespec(sk->sk_stamp);
2438	if (ts.tv_sec == -1)
2439		return -ENOENT;
2440	if (ts.tv_sec == 0) {
2441		sk->sk_stamp = ktime_get_real();
2442		ts = ktime_to_timespec(sk->sk_stamp);
2443	}
2444	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2445}
2446EXPORT_SYMBOL(sock_get_timestampns);
2447
2448void sock_enable_timestamp(struct sock *sk, int flag)
2449{
2450	if (!sock_flag(sk, flag)) {
2451		unsigned long previous_flags = sk->sk_flags;
2452
2453		sock_set_flag(sk, flag);
2454		/*
2455		 * we just set one of the two flags which require net
2456		 * time stamping, but time stamping might have been on
2457		 * already because of the other one
2458		 */
2459		if (!(previous_flags & SK_FLAGS_TIMESTAMP))
2460			net_enable_timestamp();
2461	}
2462}
2463
2464int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
2465		       int level, int type)
2466{
2467	struct sock_exterr_skb *serr;
2468	struct sk_buff *skb, *skb2;
2469	int copied, err;
2470
2471	err = -EAGAIN;
2472	skb = skb_dequeue(&sk->sk_error_queue);
2473	if (skb == NULL)
2474		goto out;
2475
2476	copied = skb->len;
2477	if (copied > len) {
2478		msg->msg_flags |= MSG_TRUNC;
2479		copied = len;
2480	}
2481	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
2482	if (err)
2483		goto out_free_skb;
2484
2485	sock_recv_timestamp(msg, sk, skb);
2486
2487	serr = SKB_EXT_ERR(skb);
2488	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
2489
2490	msg->msg_flags |= MSG_ERRQUEUE;
2491	err = copied;
2492
2493	/* Reset and regenerate socket error */
2494	spin_lock_bh(&sk->sk_error_queue.lock);
2495	sk->sk_err = 0;
2496	if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) {
2497		sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno;
2498		spin_unlock_bh(&sk->sk_error_queue.lock);
2499		sk->sk_error_report(sk);
2500	} else
2501		spin_unlock_bh(&sk->sk_error_queue.lock);
2502
2503out_free_skb:
2504	kfree_skb(skb);
2505out:
2506	return err;
2507}
2508EXPORT_SYMBOL(sock_recv_errqueue);
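
/*
 * Illustrative sketch (not part of this file): a protocol's recvmsg()
 * handler typically dispatches MSG_ERRQUEUE requests to this helper before
 * doing any normal receive work, roughly:
 *
 *	if (flags & MSG_ERRQUEUE)
 *		return sock_recv_errqueue(sk, msg, len,
 *					  SOL_EXAMPLE, EXAMPLE_RECVERR);
 *
 * where the level/type pair names the cmsg that will carry the
 * sock_extended_err (e.g. IP_RECVERR for IPv4); SOL_EXAMPLE and
 * EXAMPLE_RECVERR above are placeholders.
 */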
2509
2510/*
2511 *	Get a socket option on an socket.
2512 *
2513 *	FIX: POSIX 1003.1g is very ambiguous here. It states that
2514 *	asynchronous errors should be reported by getsockopt. We assume
2515 *	this means if you specify SO_ERROR (otherwise whats the point of it).
2516 */
2517int sock_common_getsockopt(struct socket *sock, int level, int optname,
2518			   char __user *optval, int __user *optlen)
2519{
2520	struct sock *sk = sock->sk;
2521
2522	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2523}
2524EXPORT_SYMBOL(sock_common_getsockopt);
2525
2526#ifdef CONFIG_COMPAT
2527int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2528				  char __user *optval, int __user *optlen)
2529{
2530	struct sock *sk = sock->sk;
2531
2532	if (sk->sk_prot->compat_getsockopt != NULL)
2533		return sk->sk_prot->compat_getsockopt(sk, level, optname,
2534						      optval, optlen);
2535	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2536}
2537EXPORT_SYMBOL(compat_sock_common_getsockopt);
2538#endif
2539
2540int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
2541			struct msghdr *msg, size_t size, int flags)
2542{
2543	struct sock *sk = sock->sk;
2544	int addr_len = 0;
2545	int err;
2546
2547	err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
2548				   flags & ~MSG_DONTWAIT, &addr_len);
2549	if (err >= 0)
2550		msg->msg_namelen = addr_len;
2551	return err;
2552}
2553EXPORT_SYMBOL(sock_common_recvmsg);
2554
2555/*
2556 *	Set socket options on an inet socket.
2557 */
2558int sock_common_setsockopt(struct socket *sock, int level, int optname,
2559			   char __user *optval, unsigned int optlen)
2560{
2561	struct sock *sk = sock->sk;
2562
2563	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2564}
2565EXPORT_SYMBOL(sock_common_setsockopt);
2566
2567#ifdef CONFIG_COMPAT
2568int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2569				  char __user *optval, unsigned int optlen)
2570{
2571	struct sock *sk = sock->sk;
2572
2573	if (sk->sk_prot->compat_setsockopt != NULL)
2574		return sk->sk_prot->compat_setsockopt(sk, level, optname,
2575						      optval, optlen);
2576	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2577}
2578EXPORT_SYMBOL(compat_sock_common_setsockopt);
2579#endif
2580
2581void sk_common_release(struct sock *sk)
2582{
2583	if (sk->sk_prot->destroy)
2584		sk->sk_prot->destroy(sk);
2585
2586	/*
	 * Observation: when sk_common_release() is called, processes have
	 * no access to the socket any more, but the network stack still does.
2589	 * Step one, detach it from networking:
2590	 *
2591	 * A. Remove from hash tables.
2592	 */
2593
2594	sk->sk_prot->unhash(sk);
2595
2596	/*
	 * At this point the socket cannot receive new packets, but it is
	 * possible that some packets are still in flight because another CPU
	 * ran the receiver and did its hash table lookup before we unhashed
	 * the socket. Those packets will reach the receive queue and be purged
	 * by the socket destructor.
	 *
	 * We may also still have packets pending on the receive queue and,
	 * probably, our own packets waiting in device queues. The destructor
	 * will drain the receive queue, but transmitted packets will delay
	 * socket destruction until the last reference is released.
2606	 */
2607
2608	sock_orphan(sk);
2609
2610	xfrm_sk_free_policy(sk);
2611
2612	sk_refcnt_debug_release(sk);
2613
2614	if (sk->sk_frag.page) {
2615		put_page(sk->sk_frag.page);
2616		sk->sk_frag.page = NULL;
2617	}
2618
2619	sock_put(sk);
2620}
2621EXPORT_SYMBOL(sk_common_release);
2622
2623#ifdef CONFIG_PROC_FS
2624#define PROTO_INUSE_NR	64	/* should be enough for the first time */
2625struct prot_inuse {
2626	int val[PROTO_INUSE_NR];
2627};
2628
2629static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2630
2631#ifdef CONFIG_NET_NS
2632void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2633{
2634	__this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
2635}
2636EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2637
2638int sock_prot_inuse_get(struct net *net, struct proto *prot)
2639{
2640	int cpu, idx = prot->inuse_idx;
2641	int res = 0;
2642
2643	for_each_possible_cpu(cpu)
2644		res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2645
2646	return res >= 0 ? res : 0;
2647}
2648EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2649
2650static int __net_init sock_inuse_init_net(struct net *net)
2651{
2652	net->core.inuse = alloc_percpu(struct prot_inuse);
2653	return net->core.inuse ? 0 : -ENOMEM;
2654}
2655
2656static void __net_exit sock_inuse_exit_net(struct net *net)
2657{
2658	free_percpu(net->core.inuse);
2659}
2660
2661static struct pernet_operations net_inuse_ops = {
2662	.init = sock_inuse_init_net,
2663	.exit = sock_inuse_exit_net,
2664};
2665
2666static __init int net_inuse_init(void)
2667{
2668	if (register_pernet_subsys(&net_inuse_ops))
2669		panic("Cannot initialize net inuse counters");
2670
2671	return 0;
2672}
2673
2674core_initcall(net_inuse_init);
2675#else
2676static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2677
2678void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2679{
2680	__this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
2681}
2682EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2683
2684int sock_prot_inuse_get(struct net *net, struct proto *prot)
2685{
2686	int cpu, idx = prot->inuse_idx;
2687	int res = 0;
2688
2689	for_each_possible_cpu(cpu)
2690		res += per_cpu(prot_inuse, cpu).val[idx];
2691
2692	return res >= 0 ? res : 0;
2693}
2694EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2695#endif
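
/*
 * Illustrative sketch (not part of this file): the per-protocol "inuse"
 * counter is bumped from the protocol's hash/unhash callbacks, one
 * increment per hashed socket:
 *
 *	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);	// in ->hash()
 *	...
 *	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);	// in ->unhash()
 *
 * The counter is per-CPU and unlocked, which is why sock_prot_inuse_get()
 * clamps a transiently negative sum to zero.
 */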
2696
2697static void assign_proto_idx(struct proto *prot)
2698{
2699	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2700
2701	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2702		pr_err("PROTO_INUSE_NR exhausted\n");
2703		return;
2704	}
2705
2706	set_bit(prot->inuse_idx, proto_inuse_idx);
2707}
2708
2709static void release_proto_idx(struct proto *prot)
2710{
2711	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2712		clear_bit(prot->inuse_idx, proto_inuse_idx);
2713}
2714#else
2715static inline void assign_proto_idx(struct proto *prot)
2716{
2717}
2718
2719static inline void release_proto_idx(struct proto *prot)
2720{
2721}
2722#endif
2723
2724int proto_register(struct proto *prot, int alloc_slab)
2725{
2726	if (alloc_slab) {
2727		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2728					SLAB_HWCACHE_ALIGN | prot->slab_flags,
2729					NULL);
2730
2731		if (prot->slab == NULL) {
2732			pr_crit("%s: Can't create sock SLAB cache!\n",
2733				prot->name);
2734			goto out;
2735		}
2736
2737		if (prot->rsk_prot != NULL) {
2738			prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name);
2739			if (prot->rsk_prot->slab_name == NULL)
2740				goto out_free_sock_slab;
2741
2742			prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
2743								 prot->rsk_prot->obj_size, 0,
2744								 SLAB_HWCACHE_ALIGN, NULL);
2745
2746			if (prot->rsk_prot->slab == NULL) {
2747				pr_crit("%s: Can't create request sock SLAB cache!\n",
2748					prot->name);
2749				goto out_free_request_sock_slab_name;
2750			}
2751		}
2752
2753		if (prot->twsk_prot != NULL) {
2754			prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
2755
2756			if (prot->twsk_prot->twsk_slab_name == NULL)
2757				goto out_free_request_sock_slab;
2758
2759			prot->twsk_prot->twsk_slab =
2760				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2761						  prot->twsk_prot->twsk_obj_size,
2762						  0,
2763						  SLAB_HWCACHE_ALIGN |
2764							prot->slab_flags,
2765						  NULL);
2766			if (prot->twsk_prot->twsk_slab == NULL)
2767				goto out_free_timewait_sock_slab_name;
2768		}
2769	}
2770
2771	mutex_lock(&proto_list_mutex);
2772	list_add(&prot->node, &proto_list);
2773	assign_proto_idx(prot);
2774	mutex_unlock(&proto_list_mutex);
2775	return 0;
2776
2777out_free_timewait_sock_slab_name:
2778	kfree(prot->twsk_prot->twsk_slab_name);
2779out_free_request_sock_slab:
2780	if (prot->rsk_prot && prot->rsk_prot->slab) {
2781		kmem_cache_destroy(prot->rsk_prot->slab);
2782		prot->rsk_prot->slab = NULL;
2783	}
2784out_free_request_sock_slab_name:
2785	if (prot->rsk_prot)
2786		kfree(prot->rsk_prot->slab_name);
2787out_free_sock_slab:
2788	kmem_cache_destroy(prot->slab);
2789	prot->slab = NULL;
2790out:
2791	return -ENOBUFS;
2792}
2793EXPORT_SYMBOL(proto_register);
2794
2795void proto_unregister(struct proto *prot)
2796{
2797	mutex_lock(&proto_list_mutex);
2798	release_proto_idx(prot);
2799	list_del(&prot->node);
2800	mutex_unlock(&proto_list_mutex);
2801
2802	if (prot->slab != NULL) {
2803		kmem_cache_destroy(prot->slab);
2804		prot->slab = NULL;
2805	}
2806
2807	if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
2808		kmem_cache_destroy(prot->rsk_prot->slab);
2809		kfree(prot->rsk_prot->slab_name);
2810		prot->rsk_prot->slab = NULL;
2811	}
2812
2813	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2814		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2815		kfree(prot->twsk_prot->twsk_slab_name);
2816		prot->twsk_prot->twsk_slab = NULL;
2817	}
2818}
2819EXPORT_SYMBOL(proto_unregister);
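
/*
 * Illustrative sketch (not part of this file): a protocol module registers
 * its struct proto at init time and unregisters it on exit; the names below
 * are hypothetical:
 *
 *	static struct proto example_proto = {
 *		.name		= "EXAMPLE",
 *		.owner		= THIS_MODULE,
 *		.obj_size	= sizeof(struct example_sock),
 *	};
 *
 *	static int __init example_init(void)
 *	{
 *		return proto_register(&example_proto, 1);
 *	}
 *
 *	static void __exit example_exit(void)
 *	{
 *		proto_unregister(&example_proto);
 *	}
 *
 * Passing alloc_slab == 1 asks proto_register() to create the per-protocol
 * kmem_cache used by sk_alloc() for sockets of this type.
 */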
2820
2821#ifdef CONFIG_PROC_FS
2822static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2823	__acquires(proto_list_mutex)
2824{
2825	mutex_lock(&proto_list_mutex);
2826	return seq_list_start_head(&proto_list, *pos);
2827}
2828
2829static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2830{
2831	return seq_list_next(v, &proto_list, pos);
2832}
2833
2834static void proto_seq_stop(struct seq_file *seq, void *v)
2835	__releases(proto_list_mutex)
2836{
2837	mutex_unlock(&proto_list_mutex);
2838}
2839
2840static char proto_method_implemented(const void *method)
2841{
2842	return method == NULL ? 'n' : 'y';
2843}
2844static long sock_prot_memory_allocated(struct proto *proto)
2845{
2846	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
2847}
2848
2849static char *sock_prot_memory_pressure(struct proto *proto)
2850{
2851	return proto->memory_pressure != NULL ?
2852	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
2853}
2854
2855static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2856{
2857
2858	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
2859			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2860		   proto->name,
2861		   proto->obj_size,
2862		   sock_prot_inuse_get(seq_file_net(seq), proto),
2863		   sock_prot_memory_allocated(proto),
2864		   sock_prot_memory_pressure(proto),
2865		   proto->max_header,
2866		   proto->slab == NULL ? "no" : "yes",
2867		   module_name(proto->owner),
2868		   proto_method_implemented(proto->close),
2869		   proto_method_implemented(proto->connect),
2870		   proto_method_implemented(proto->disconnect),
2871		   proto_method_implemented(proto->accept),
2872		   proto_method_implemented(proto->ioctl),
2873		   proto_method_implemented(proto->init),
2874		   proto_method_implemented(proto->destroy),
2875		   proto_method_implemented(proto->shutdown),
2876		   proto_method_implemented(proto->setsockopt),
2877		   proto_method_implemented(proto->getsockopt),
2878		   proto_method_implemented(proto->sendmsg),
2879		   proto_method_implemented(proto->recvmsg),
2880		   proto_method_implemented(proto->sendpage),
2881		   proto_method_implemented(proto->bind),
2882		   proto_method_implemented(proto->backlog_rcv),
2883		   proto_method_implemented(proto->hash),
2884		   proto_method_implemented(proto->unhash),
2885		   proto_method_implemented(proto->get_port),
2886		   proto_method_implemented(proto->enter_memory_pressure));
2887}
2888
2889static int proto_seq_show(struct seq_file *seq, void *v)
2890{
2891	if (v == &proto_list)
2892		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2893			   "protocol",
2894			   "size",
2895			   "sockets",
2896			   "memory",
2897			   "press",
2898			   "maxhdr",
2899			   "slab",
2900			   "module",
2901			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2902	else
2903		proto_seq_printf(seq, list_entry(v, struct proto, node));
2904	return 0;
2905}
2906
2907static const struct seq_operations proto_seq_ops = {
2908	.start  = proto_seq_start,
2909	.next   = proto_seq_next,
2910	.stop   = proto_seq_stop,
2911	.show   = proto_seq_show,
2912};
2913
2914static int proto_seq_open(struct inode *inode, struct file *file)
2915{
2916	return seq_open_net(inode, file, &proto_seq_ops,
2917			    sizeof(struct seq_net_private));
2918}
2919
2920static const struct file_operations proto_seq_fops = {
2921	.owner		= THIS_MODULE,
2922	.open		= proto_seq_open,
2923	.read		= seq_read,
2924	.llseek		= seq_lseek,
2925	.release	= seq_release_net,
2926};
2927
2928static __net_init int proto_init_net(struct net *net)
2929{
2930	if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
2931		return -ENOMEM;
2932
2933	return 0;
2934}
2935
2936static __net_exit void proto_exit_net(struct net *net)
2937{
2938	remove_proc_entry("protocols", net->proc_net);
2939}
2940
2941
2942static __net_initdata struct pernet_operations proto_net_ops = {
2943	.init = proto_init_net,
2944	.exit = proto_exit_net,
2945};
2946
2947static int __init proto_init(void)
2948{
2949	return register_pernet_subsys(&proto_net_ops);
2950}
2951
2952subsys_initcall(proto_init);
2953
2954#endif /* PROC_FS */
2955