sock.c revision d59577b6ffd313d0ab3be39cb1ab47e29bdc9182
1/*
2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
3 *		operating system.  INET is implemented using the  BSD Socket
4 *		interface as the means of communication with the user level.
5 *
6 *		Generic socket support routines. Memory allocators, socket lock/release
7 *		handler for protocols to use and generic option handler.
8 *
9 *
10 * Authors:	Ross Biro
11 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 *		Florian La Roche, <flla@stud.uni-sb.de>
13 *		Alan Cox, <A.Cox@swansea.ac.uk>
14 *
15 * Fixes:
16 *		Alan Cox	: 	Numerous verify_area() problems
17 *		Alan Cox	:	Connecting on a connecting socket
18 *					now returns an error for tcp.
19 *		Alan Cox	:	sock->protocol is set correctly.
20 *					and is not sometimes left as 0.
21 *		Alan Cox	:	connect handles icmp errors on a
22 *					connect properly. Unfortunately there
23 *					is a restart syscall nasty there. I
24 *					can't match BSD without hacking the C
25 *					library. Ideas urgently sought!
26 *		Alan Cox	:	Disallow bind() to addresses that are
27 *					not ours - especially broadcast ones!!
28 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30 *					instead they leave that for the DESTROY timer.
31 *		Alan Cox	:	Clean up error flag in accept
32 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33 *					was buggy. Put a remove_sock() in the handler
34 *					for memory when we hit 0. Also altered the timer
35 *					code. The ACK stuff can wait and needs major
36 *					TCP layer surgery.
37 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38 *					and fixed timer/inet_bh race.
39 *		Alan Cox	:	Added zapped flag for TCP
40 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47 *	Pauline Middelink	:	identd support
48 *		Alan Cox	:	Fixed connect() taking signals I think.
49 *		Alan Cox	:	SO_LINGER supported
50 *		Alan Cox	:	Error reporting fixes
51 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52 *		Alan Cox	:	inet sockets don't set sk->type!
53 *		Alan Cox	:	Split socket option code
54 *		Alan Cox	:	Callbacks
55 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56 *		Alex		:	Removed restriction on inet fioctl
57 *		Alan Cox	:	Splitting INET from NET core
58 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60 *		Alan Cox	:	Split IP from generic code
61 *		Alan Cox	:	New kfree_skbmem()
62 *		Alan Cox	:	Make SO_DEBUG superuser only.
63 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64 *					(compatibility fix)
65 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66 *		Alan Cox	:	Allocator for a socket is settable.
67 *		Alan Cox	:	SO_ERROR includes soft errors.
68 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69 *		Alan Cox	: 	Generic socket allocation to make hooks
70 *					easier (suggested by Craig Metz).
71 *		Michael Pall	:	SO_ERROR returns positive errno again
72 *              Steve Whitehouse:       Added default destructor to free
73 *                                      protocol private data.
74 *              Steve Whitehouse:       Added various other default routines
75 *                                      common to several socket families.
76 *              Chris Evans     :       Call suser() check last on F_SETOWN
77 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79 *		Andi Kleen	:	Fix write_space callback
80 *		Chris Evans	:	Security fixes - signedness again
81 *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82 *
83 * To Fix:
84 *
85 *
86 *		This program is free software; you can redistribute it and/or
87 *		modify it under the terms of the GNU General Public License
88 *		as published by the Free Software Foundation; either version
89 *		2 of the License, or (at your option) any later version.
90 */
91
92#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
93
94#include <linux/capability.h>
95#include <linux/errno.h>
96#include <linux/types.h>
97#include <linux/socket.h>
98#include <linux/in.h>
99#include <linux/kernel.h>
100#include <linux/module.h>
101#include <linux/proc_fs.h>
102#include <linux/seq_file.h>
103#include <linux/sched.h>
104#include <linux/timer.h>
105#include <linux/string.h>
106#include <linux/sockios.h>
107#include <linux/net.h>
108#include <linux/mm.h>
109#include <linux/slab.h>
110#include <linux/interrupt.h>
111#include <linux/poll.h>
112#include <linux/tcp.h>
113#include <linux/init.h>
114#include <linux/highmem.h>
115#include <linux/user_namespace.h>
116#include <linux/static_key.h>
117#include <linux/memcontrol.h>
118#include <linux/prefetch.h>
119
120#include <asm/uaccess.h>
121
122#include <linux/netdevice.h>
123#include <net/protocol.h>
124#include <linux/skbuff.h>
125#include <net/net_namespace.h>
126#include <net/request_sock.h>
127#include <net/sock.h>
128#include <linux/net_tstamp.h>
129#include <net/xfrm.h>
130#include <linux/ipsec.h>
131#include <net/cls_cgroup.h>
132#include <net/netprio_cgroup.h>
133
134#include <linux/filter.h>
135
136#include <trace/events/sock.h>
137
138#ifdef CONFIG_INET
139#include <net/tcp.h>
140#endif
141
142static DEFINE_MUTEX(proto_list_mutex);
143static LIST_HEAD(proto_list);
144
145#ifdef CONFIG_MEMCG_KMEM
146int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
147{
148	struct proto *proto;
149	int ret = 0;
150
151	mutex_lock(&proto_list_mutex);
152	list_for_each_entry(proto, &proto_list, node) {
153		if (proto->init_cgroup) {
154			ret = proto->init_cgroup(memcg, ss);
155			if (ret)
156				goto out;
157		}
158	}
159
160	mutex_unlock(&proto_list_mutex);
161	return ret;
162out:
163	list_for_each_entry_continue_reverse(proto, &proto_list, node)
164		if (proto->destroy_cgroup)
165			proto->destroy_cgroup(memcg);
166	mutex_unlock(&proto_list_mutex);
167	return ret;
168}
169
170void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg)
171{
172	struct proto *proto;
173
174	mutex_lock(&proto_list_mutex);
175	list_for_each_entry_reverse(proto, &proto_list, node)
176		if (proto->destroy_cgroup)
177			proto->destroy_cgroup(memcg);
178	mutex_unlock(&proto_list_mutex);
179}
180#endif
181
182/*
183 * Each address family might have different locking rules, so we have
184 * one slock key per address family:
185 */
186static struct lock_class_key af_family_keys[AF_MAX];
187static struct lock_class_key af_family_slock_keys[AF_MAX];
188
189struct static_key memcg_socket_limit_enabled;
190EXPORT_SYMBOL(memcg_socket_limit_enabled);
191
192/*
193 * Make lock validator output more readable. (we pre-construct these
194 * strings build-time, so that runtime initialization of socket
195 * locks is fast):
196 */
197static const char *const af_family_key_strings[AF_MAX+1] = {
198  "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
199  "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
200  "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
201  "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
202  "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
203  "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
204  "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
205  "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
206  "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
207  "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
208  "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-AF_IUCV"     ,
209  "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
210  "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
211  "sk_lock-AF_NFC"   , "sk_lock-AF_MAX"
212};
213static const char *const af_family_slock_key_strings[AF_MAX+1] = {
214  "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
215  "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
216  "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
217  "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
218  "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
219  "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
220  "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
221  "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
222  "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
223  "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
224  "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
225  "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
226  "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
227  "slock-AF_NFC"   , "slock-AF_MAX"
228};
229static const char *const af_family_clock_key_strings[AF_MAX+1] = {
230  "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
231  "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
232  "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
233  "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
234  "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
235  "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
236  "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
237  "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
238  "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
239  "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
240  "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
241  "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
242  "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
243  "clock-AF_NFC"   , "clock-AF_MAX"
244};
245
246/*
247 * sk_callback_lock locking rules are per-address-family,
248 * so split the lock classes by using a per-AF key:
249 */
250static struct lock_class_key af_callback_keys[AF_MAX];
251
252/* Take into consideration the size of the struct sk_buff overhead in the
253 * determination of these values, since that is non-constant across
254 * platforms.  This makes socket queueing behavior and performance
255 * not depend upon such differences.
256 */
257#define _SK_MEM_PACKETS		256
258#define _SK_MEM_OVERHEAD	SKB_TRUESIZE(256)
259#define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
260#define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
261
262/* Run time adjustable parameters. */
263__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
264EXPORT_SYMBOL(sysctl_wmem_max);
265__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
266EXPORT_SYMBOL(sysctl_rmem_max);
267__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
268__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
269
270/* Maximal space eaten by iovec or ancillary data plus some space */
271int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
272EXPORT_SYMBOL(sysctl_optmem_max);
273
274struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
275EXPORT_SYMBOL_GPL(memalloc_socks);
276
277/**
278 * sk_set_memalloc - sets %SOCK_MEMALLOC
279 * @sk: socket to set it on
280 *
281 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
282 * It's the responsibility of the admin to adjust min_free_kbytes
283 * to meet the requirements
284 */
285void sk_set_memalloc(struct sock *sk)
286{
287	sock_set_flag(sk, SOCK_MEMALLOC);
288	sk->sk_allocation |= __GFP_MEMALLOC;
289	static_key_slow_inc(&memalloc_socks);
290}
291EXPORT_SYMBOL_GPL(sk_set_memalloc);
292
293void sk_clear_memalloc(struct sock *sk)
294{
295	sock_reset_flag(sk, SOCK_MEMALLOC);
296	sk->sk_allocation &= ~__GFP_MEMALLOC;
297	static_key_slow_dec(&memalloc_socks);
298
299	/*
300	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
301	 * progress of swapping. However, if SOCK_MEMALLOC is cleared while
302	 * it has rmem allocations there is a risk that the user of the
303	 * socket cannot make forward progress due to exceeding the rmem
304	 * limits. By rights, sk_clear_memalloc() should only be called
305	 * on sockets being torn down but warn and reset the accounting if
306	 * that assumption breaks.
307	 */
308	if (WARN_ON(sk->sk_forward_alloc))
309		sk_mem_reclaim(sk);
310}
311EXPORT_SYMBOL_GPL(sk_clear_memalloc);
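
/*
 * Illustrative sketch (not part of this file): a kernel user that services
 * memory reclaim over the network -- e.g. a swap-over-network transport --
 * might flag its transport socket as below so that it can dip into the
 * PFMEMALLOC reserves, and clear the flag only when tearing the socket down.
 * The function names are hypothetical.
 */
static void example_mark_reclaim_socket(struct socket *transport)
{
	sk_set_memalloc(transport->sk);		/* may use emergency reserves */
}

static void example_unmark_reclaim_socket(struct socket *transport)
{
	sk_clear_memalloc(transport->sk);	/* only on teardown, see above */
}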
312
313int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
314{
315	int ret;
316	unsigned long pflags = current->flags;
317
318	/* these should have been dropped before queueing */
319	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
320
321	current->flags |= PF_MEMALLOC;
322	ret = sk->sk_backlog_rcv(sk, skb);
323	tsk_restore_flags(current, pflags, PF_MEMALLOC);
324
325	return ret;
326}
327EXPORT_SYMBOL(__sk_backlog_rcv);
328
329static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
330{
331	struct timeval tv;
332
333	if (optlen < sizeof(tv))
334		return -EINVAL;
335	if (copy_from_user(&tv, optval, sizeof(tv)))
336		return -EFAULT;
337	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
338		return -EDOM;
339
340	if (tv.tv_sec < 0) {
341		static int warned __read_mostly;
342
343		*timeo_p = 0;
344		if (warned < 10 && net_ratelimit()) {
345			warned++;
346			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
347				__func__, current->comm, task_pid_nr(current));
348		}
349		return 0;
350	}
351	*timeo_p = MAX_SCHEDULE_TIMEOUT;
352	if (tv.tv_sec == 0 && tv.tv_usec == 0)
353		return 0;
354	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
355		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
356	return 0;
357}
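
/*
 * Illustrative user-space sketch (not part of this file, shown as a comment):
 * the timeout parsed by sock_set_timeout() arrives as a struct timeval, e.g.
 *
 *	struct timeval tv = { .tv_sec = 5, .tv_usec = 0 };
 *
 *	if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)) < 0)
 *		perror("setsockopt(SO_RCVTIMEO)");
 *
 * A tv_usec outside [0, USEC_PER_SEC) is rejected with -EDOM, and a negative
 * tv_sec is clamped to an immediate (non-blocking) timeout, as handled above.
 */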
358
359static void sock_warn_obsolete_bsdism(const char *name)
360{
361	static int warned;
362	static char warncomm[TASK_COMM_LEN];
363	if (strcmp(warncomm, current->comm) && warned < 5) {
364		strcpy(warncomm,  current->comm);
365		pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
366			warncomm, name);
367		warned++;
368	}
369}
370
371#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
372
373static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
374{
375	if (sk->sk_flags & flags) {
376		sk->sk_flags &= ~flags;
377		if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP))
378			net_disable_timestamp();
379	}
380}
381
382
383int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
384{
385	int err;
386	int skb_len;
387	unsigned long flags;
388	struct sk_buff_head *list = &sk->sk_receive_queue;
389
390	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
391		atomic_inc(&sk->sk_drops);
392		trace_sock_rcvqueue_full(sk, skb);
393		return -ENOMEM;
394	}
395
396	err = sk_filter(sk, skb);
397	if (err)
398		return err;
399
400	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
401		atomic_inc(&sk->sk_drops);
402		return -ENOBUFS;
403	}
404
405	skb->dev = NULL;
406	skb_set_owner_r(skb, sk);
407
408	/* Cache the SKB length before we tack it onto the receive
409	 * queue.  Once it is added it no longer belongs to us and
410	 * may be freed by other threads of control pulling packets
411	 * from the queue.
412	 */
413	skb_len = skb->len;
414
415	/* we escape from the rcu protected region, make sure we don't leak
416	 * a non-refcounted dst
417	 */
418	skb_dst_force(skb);
419
420	spin_lock_irqsave(&list->lock, flags);
421	skb->dropcount = atomic_read(&sk->sk_drops);
422	__skb_queue_tail(list, skb);
423	spin_unlock_irqrestore(&list->lock, flags);
424
425	if (!sock_flag(sk, SOCK_DEAD))
426		sk->sk_data_ready(sk, skb_len);
427	return 0;
428}
429EXPORT_SYMBOL(sock_queue_rcv_skb);
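
/*
 * Illustrative sketch (not part of this file): a datagram protocol's receive
 * path typically hands matched skbs to sock_queue_rcv_skb() and frees the skb
 * itself when queueing fails.  The function name is hypothetical.
 */
static int example_proto_queue_rcv(struct sock *sk, struct sk_buff *skb)
{
	int err = sock_queue_rcv_skb(sk, skb);

	if (err < 0)		/* -ENOMEM, -ENOBUFS or a filter verdict */
		kfree_skb(skb);
	return err;
}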
430
431int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
432{
433	int rc = NET_RX_SUCCESS;
434
435	if (sk_filter(sk, skb))
436		goto discard_and_relse;
437
438	skb->dev = NULL;
439
440	if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) {
441		atomic_inc(&sk->sk_drops);
442		goto discard_and_relse;
443	}
444	if (nested)
445		bh_lock_sock_nested(sk);
446	else
447		bh_lock_sock(sk);
448	if (!sock_owned_by_user(sk)) {
449		/*
450		 * trylock + unlock semantics:
451		 */
452		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
453
454		rc = sk_backlog_rcv(sk, skb);
455
456		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
457	} else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
458		bh_unlock_sock(sk);
459		atomic_inc(&sk->sk_drops);
460		goto discard_and_relse;
461	}
462
463	bh_unlock_sock(sk);
464out:
465	sock_put(sk);
466	return rc;
467discard_and_relse:
468	kfree_skb(skb);
469	goto out;
470}
471EXPORT_SYMBOL(sk_receive_skb);
472
473void sk_reset_txq(struct sock *sk)
474{
475	sk_tx_queue_clear(sk);
476}
477EXPORT_SYMBOL(sk_reset_txq);
478
479struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
480{
481	struct dst_entry *dst = __sk_dst_get(sk);
482
483	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
484		sk_tx_queue_clear(sk);
485		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
486		dst_release(dst);
487		return NULL;
488	}
489
490	return dst;
491}
492EXPORT_SYMBOL(__sk_dst_check);
493
494struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
495{
496	struct dst_entry *dst = sk_dst_get(sk);
497
498	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
499		sk_dst_reset(sk);
500		dst_release(dst);
501		return NULL;
502	}
503
504	return dst;
505}
506EXPORT_SYMBOL(sk_dst_check);
507
508static int sock_setbindtodevice(struct sock *sk, char __user *optval,
509				int optlen)
510{
511	int ret = -ENOPROTOOPT;
512#ifdef CONFIG_NETDEVICES
513	struct net *net = sock_net(sk);
514	char devname[IFNAMSIZ];
515	int index;
516
517	/* Sorry... */
518	ret = -EPERM;
519	if (!ns_capable(net->user_ns, CAP_NET_RAW))
520		goto out;
521
522	ret = -EINVAL;
523	if (optlen < 0)
524		goto out;
525
526	/* Bind this socket to a particular device like "eth0",
527	 * as specified in the passed interface name. If the
528	 * name is "" or the option length is zero the socket
529	 * is not bound.
530	 */
531	if (optlen > IFNAMSIZ - 1)
532		optlen = IFNAMSIZ - 1;
533	memset(devname, 0, sizeof(devname));
534
535	ret = -EFAULT;
536	if (copy_from_user(devname, optval, optlen))
537		goto out;
538
539	index = 0;
540	if (devname[0] != '\0') {
541		struct net_device *dev;
542
543		rcu_read_lock();
544		dev = dev_get_by_name_rcu(net, devname);
545		if (dev)
546			index = dev->ifindex;
547		rcu_read_unlock();
548		ret = -ENODEV;
549		if (!dev)
550			goto out;
551	}
552
553	lock_sock(sk);
554	sk->sk_bound_dev_if = index;
555	sk_dst_reset(sk);
556	release_sock(sk);
557
558	ret = 0;
559
560out:
561#endif
562
563	return ret;
564}
565
566static int sock_getbindtodevice(struct sock *sk, char __user *optval,
567				int __user *optlen, int len)
568{
569	int ret = -ENOPROTOOPT;
570#ifdef CONFIG_NETDEVICES
571	struct net *net = sock_net(sk);
572	struct net_device *dev;
573	char devname[IFNAMSIZ];
574	unsigned seq;
575
576	if (sk->sk_bound_dev_if == 0) {
577		len = 0;
578		goto zero;
579	}
580
581	ret = -EINVAL;
582	if (len < IFNAMSIZ)
583		goto out;
584
585retry:
586	seq = read_seqcount_begin(&devnet_rename_seq);
587	rcu_read_lock();
588	dev = dev_get_by_index_rcu(net, sk->sk_bound_dev_if);
589	ret = -ENODEV;
590	if (!dev) {
591		rcu_read_unlock();
592		goto out;
593	}
594
595	strcpy(devname, dev->name);
596	rcu_read_unlock();
597	if (read_seqcount_retry(&devnet_rename_seq, seq))
598		goto retry;
599
600	len = strlen(devname) + 1;
601
602	ret = -EFAULT;
603	if (copy_to_user(optval, devname, len))
604		goto out;
605
606zero:
607	ret = -EFAULT;
608	if (put_user(len, optlen))
609		goto out;
610
611	ret = 0;
612
613out:
614#endif
615
616	return ret;
617}
618
619static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
620{
621	if (valbool)
622		sock_set_flag(sk, bit);
623	else
624		sock_reset_flag(sk, bit);
625}
626
627/*
628 *	This is meant for all protocols to use and covers goings on
629 *	at the socket level. Everything here is generic.
630 */
631
632int sock_setsockopt(struct socket *sock, int level, int optname,
633		    char __user *optval, unsigned int optlen)
634{
635	struct sock *sk = sock->sk;
636	int val;
637	int valbool;
638	struct linger ling;
639	int ret = 0;
640
641	/*
642	 *	Options without arguments
643	 */
644
645	if (optname == SO_BINDTODEVICE)
646		return sock_setbindtodevice(sk, optval, optlen);
647
648	if (optlen < sizeof(int))
649		return -EINVAL;
650
651	if (get_user(val, (int __user *)optval))
652		return -EFAULT;
653
654	valbool = val ? 1 : 0;
655
656	lock_sock(sk);
657
658	switch (optname) {
659	case SO_DEBUG:
660		if (val && !capable(CAP_NET_ADMIN))
661			ret = -EACCES;
662		else
663			sock_valbool_flag(sk, SOCK_DBG, valbool);
664		break;
665	case SO_REUSEADDR:
666		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
667		break;
668	case SO_TYPE:
669	case SO_PROTOCOL:
670	case SO_DOMAIN:
671	case SO_ERROR:
672		ret = -ENOPROTOOPT;
673		break;
674	case SO_DONTROUTE:
675		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
676		break;
677	case SO_BROADCAST:
678		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
679		break;
680	case SO_SNDBUF:
681		/* Don't error on this; BSD doesn't, and if you think
682		 * about it, this is right. Otherwise apps have to
683		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
684		 * are treated in BSD as hints.
685		 */
686		val = min_t(u32, val, sysctl_wmem_max);
687set_sndbuf:
688		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
689		sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF);
690		/* Wake up sending tasks if we upped the value. */
691		sk->sk_write_space(sk);
692		break;
693
694	case SO_SNDBUFFORCE:
695		if (!capable(CAP_NET_ADMIN)) {
696			ret = -EPERM;
697			break;
698		}
699		goto set_sndbuf;
700
701	case SO_RCVBUF:
702		/* Don't error on this; BSD doesn't, and if you think
703		 * about it, this is right. Otherwise apps have to
704		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
705		 * are treated in BSD as hints.
706		 */
707		val = min_t(u32, val, sysctl_rmem_max);
708set_rcvbuf:
709		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
710		/*
711		 * We double it on the way in to account for
712		 * "struct sk_buff" etc. overhead.   Applications
713		 * assume that the SO_RCVBUF setting they make will
714		 * allow that much actual data to be received on that
715		 * socket.
716		 *
717		 * Applications are unaware that "struct sk_buff" and
718		 * other overheads allocate from the receive buffer
719		 * during socket buffer allocation.
720		 *
721		 * And after considering the possible alternatives,
722		 * returning the value we actually used in getsockopt
723		 * is the most desirable behavior.
724		 */
725		sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF);
726		break;
727
728	case SO_RCVBUFFORCE:
729		if (!capable(CAP_NET_ADMIN)) {
730			ret = -EPERM;
731			break;
732		}
733		goto set_rcvbuf;
734
735	case SO_KEEPALIVE:
736#ifdef CONFIG_INET
737		if (sk->sk_protocol == IPPROTO_TCP &&
738		    sk->sk_type == SOCK_STREAM)
739			tcp_set_keepalive(sk, valbool);
740#endif
741		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
742		break;
743
744	case SO_OOBINLINE:
745		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
746		break;
747
748	case SO_NO_CHECK:
749		sk->sk_no_check = valbool;
750		break;
751
752	case SO_PRIORITY:
753		if ((val >= 0 && val <= 6) ||
754		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
755			sk->sk_priority = val;
756		else
757			ret = -EPERM;
758		break;
759
760	case SO_LINGER:
761		if (optlen < sizeof(ling)) {
762			ret = -EINVAL;	/* 1003.1g */
763			break;
764		}
765		if (copy_from_user(&ling, optval, sizeof(ling))) {
766			ret = -EFAULT;
767			break;
768		}
769		if (!ling.l_onoff)
770			sock_reset_flag(sk, SOCK_LINGER);
771		else {
772#if (BITS_PER_LONG == 32)
773			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
774				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
775			else
776#endif
777				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
778			sock_set_flag(sk, SOCK_LINGER);
779		}
780		break;
781
782	case SO_BSDCOMPAT:
783		sock_warn_obsolete_bsdism("setsockopt");
784		break;
785
786	case SO_PASSCRED:
787		if (valbool)
788			set_bit(SOCK_PASSCRED, &sock->flags);
789		else
790			clear_bit(SOCK_PASSCRED, &sock->flags);
791		break;
792
793	case SO_TIMESTAMP:
794	case SO_TIMESTAMPNS:
795		if (valbool)  {
796			if (optname == SO_TIMESTAMP)
797				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
798			else
799				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
800			sock_set_flag(sk, SOCK_RCVTSTAMP);
801			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
802		} else {
803			sock_reset_flag(sk, SOCK_RCVTSTAMP);
804			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
805		}
806		break;
807
808	case SO_TIMESTAMPING:
809		if (val & ~SOF_TIMESTAMPING_MASK) {
810			ret = -EINVAL;
811			break;
812		}
813		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
814				  val & SOF_TIMESTAMPING_TX_HARDWARE);
815		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
816				  val & SOF_TIMESTAMPING_TX_SOFTWARE);
817		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
818				  val & SOF_TIMESTAMPING_RX_HARDWARE);
819		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
820			sock_enable_timestamp(sk,
821					      SOCK_TIMESTAMPING_RX_SOFTWARE);
822		else
823			sock_disable_timestamp(sk,
824					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
825		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
826				  val & SOF_TIMESTAMPING_SOFTWARE);
827		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
828				  val & SOF_TIMESTAMPING_SYS_HARDWARE);
829		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
830				  val & SOF_TIMESTAMPING_RAW_HARDWARE);
831		break;
832
833	case SO_RCVLOWAT:
834		if (val < 0)
835			val = INT_MAX;
836		sk->sk_rcvlowat = val ? : 1;
837		break;
838
839	case SO_RCVTIMEO:
840		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
841		break;
842
843	case SO_SNDTIMEO:
844		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
845		break;
846
847	case SO_ATTACH_FILTER:
848		ret = -EINVAL;
849		if (optlen == sizeof(struct sock_fprog)) {
850			struct sock_fprog fprog;
851
852			ret = -EFAULT;
853			if (copy_from_user(&fprog, optval, sizeof(fprog)))
854				break;
855
856			ret = sk_attach_filter(&fprog, sk);
857		}
858		break;
859
860	case SO_DETACH_FILTER:
861		ret = sk_detach_filter(sk);
862		break;
863
864	case SO_LOCK_FILTER:
865		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
866			ret = -EPERM;
867		else
868			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
869		break;
870
871	case SO_PASSSEC:
872		if (valbool)
873			set_bit(SOCK_PASSSEC, &sock->flags);
874		else
875			clear_bit(SOCK_PASSSEC, &sock->flags);
876		break;
877	case SO_MARK:
878		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
879			ret = -EPERM;
880		else
881			sk->sk_mark = val;
882		break;
883
884		/* We implement SO_SNDLOWAT etc. as
885		   not settable (1003.1g 5.3). */
886	case SO_RXQ_OVFL:
887		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
888		break;
889
890	case SO_WIFI_STATUS:
891		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
892		break;
893
894	case SO_PEEK_OFF:
895		if (sock->ops->set_peek_off)
896			sock->ops->set_peek_off(sk, val);
897		else
898			ret = -EOPNOTSUPP;
899		break;
900
901	case SO_NOFCS:
902		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
903		break;
904
905	default:
906		ret = -ENOPROTOOPT;
907		break;
908	}
909	release_sock(sk);
910	return ret;
911}
912EXPORT_SYMBOL(sock_setsockopt);
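
/*
 * Illustrative user-space sketch (not part of this file, shown as a comment):
 * because SO_RCVBUF/SO_SNDBUF values are doubled on the way in to cover
 * struct sk_buff overhead, reading the option back returns twice the request
 * (assuming the request stayed below net.core.rmem_max):
 *
 *	int req = 65536, got;
 *	socklen_t len = sizeof(got);
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &req, sizeof(req));
 *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &got, &len);
 *	// got == 131072 here, i.e. 2 * req
 */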
913
914
915void cred_to_ucred(struct pid *pid, const struct cred *cred,
916		   struct ucred *ucred)
917{
918	ucred->pid = pid_vnr(pid);
919	ucred->uid = ucred->gid = -1;
920	if (cred) {
921		struct user_namespace *current_ns = current_user_ns();
922
923		ucred->uid = from_kuid_munged(current_ns, cred->euid);
924		ucred->gid = from_kgid_munged(current_ns, cred->egid);
925	}
926}
927EXPORT_SYMBOL_GPL(cred_to_ucred);
928
929int sock_getsockopt(struct socket *sock, int level, int optname,
930		    char __user *optval, int __user *optlen)
931{
932	struct sock *sk = sock->sk;
933
934	union {
935		int val;
936		struct linger ling;
937		struct timeval tm;
938	} v;
939
940	int lv = sizeof(int);
941	int len;
942
943	if (get_user(len, optlen))
944		return -EFAULT;
945	if (len < 0)
946		return -EINVAL;
947
948	memset(&v, 0, sizeof(v));
949
950	switch (optname) {
951	case SO_DEBUG:
952		v.val = sock_flag(sk, SOCK_DBG);
953		break;
954
955	case SO_DONTROUTE:
956		v.val = sock_flag(sk, SOCK_LOCALROUTE);
957		break;
958
959	case SO_BROADCAST:
960		v.val = sock_flag(sk, SOCK_BROADCAST);
961		break;
962
963	case SO_SNDBUF:
964		v.val = sk->sk_sndbuf;
965		break;
966
967	case SO_RCVBUF:
968		v.val = sk->sk_rcvbuf;
969		break;
970
971	case SO_REUSEADDR:
972		v.val = sk->sk_reuse;
973		break;
974
975	case SO_KEEPALIVE:
976		v.val = sock_flag(sk, SOCK_KEEPOPEN);
977		break;
978
979	case SO_TYPE:
980		v.val = sk->sk_type;
981		break;
982
983	case SO_PROTOCOL:
984		v.val = sk->sk_protocol;
985		break;
986
987	case SO_DOMAIN:
988		v.val = sk->sk_family;
989		break;
990
991	case SO_ERROR:
992		v.val = -sock_error(sk);
993		if (v.val == 0)
994			v.val = xchg(&sk->sk_err_soft, 0);
995		break;
996
997	case SO_OOBINLINE:
998		v.val = sock_flag(sk, SOCK_URGINLINE);
999		break;
1000
1001	case SO_NO_CHECK:
1002		v.val = sk->sk_no_check;
1003		break;
1004
1005	case SO_PRIORITY:
1006		v.val = sk->sk_priority;
1007		break;
1008
1009	case SO_LINGER:
1010		lv		= sizeof(v.ling);
1011		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
1012		v.ling.l_linger	= sk->sk_lingertime / HZ;
1013		break;
1014
1015	case SO_BSDCOMPAT:
1016		sock_warn_obsolete_bsdism("getsockopt");
1017		break;
1018
1019	case SO_TIMESTAMP:
1020		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1021				!sock_flag(sk, SOCK_RCVTSTAMPNS);
1022		break;
1023
1024	case SO_TIMESTAMPNS:
1025		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1026		break;
1027
1028	case SO_TIMESTAMPING:
1029		v.val = 0;
1030		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
1031			v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
1032		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
1033			v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
1034		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
1035			v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
1036		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
1037			v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
1038		if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
1039			v.val |= SOF_TIMESTAMPING_SOFTWARE;
1040		if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
1041			v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
1042		if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
1043			v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
1044		break;
1045
1046	case SO_RCVTIMEO:
1047		lv = sizeof(struct timeval);
1048		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1049			v.tm.tv_sec = 0;
1050			v.tm.tv_usec = 0;
1051		} else {
1052			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1053			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
1054		}
1055		break;
1056
1057	case SO_SNDTIMEO:
1058		lv = sizeof(struct timeval);
1059		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1060			v.tm.tv_sec = 0;
1061			v.tm.tv_usec = 0;
1062		} else {
1063			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1064			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
1065		}
1066		break;
1067
1068	case SO_RCVLOWAT:
1069		v.val = sk->sk_rcvlowat;
1070		break;
1071
1072	case SO_SNDLOWAT:
1073		v.val = 1;
1074		break;
1075
1076	case SO_PASSCRED:
1077		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1078		break;
1079
1080	case SO_PEERCRED:
1081	{
1082		struct ucred peercred;
1083		if (len > sizeof(peercred))
1084			len = sizeof(peercred);
1085		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1086		if (copy_to_user(optval, &peercred, len))
1087			return -EFAULT;
1088		goto lenout;
1089	}
1090
1091	case SO_PEERNAME:
1092	{
1093		char address[128];
1094
1095		if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
1096			return -ENOTCONN;
1097		if (lv < len)
1098			return -EINVAL;
1099		if (copy_to_user(optval, address, len))
1100			return -EFAULT;
1101		goto lenout;
1102	}
1103
1104	/* Dubious BSD thing... Probably nobody even uses it, but
1105	 * the UNIX standard wants it for whatever reason... -DaveM
1106	 */
1107	case SO_ACCEPTCONN:
1108		v.val = sk->sk_state == TCP_LISTEN;
1109		break;
1110
1111	case SO_PASSSEC:
1112		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1113		break;
1114
1115	case SO_PEERSEC:
1116		return security_socket_getpeersec_stream(sock, optval, optlen, len);
1117
1118	case SO_MARK:
1119		v.val = sk->sk_mark;
1120		break;
1121
1122	case SO_RXQ_OVFL:
1123		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1124		break;
1125
1126	case SO_WIFI_STATUS:
1127		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1128		break;
1129
1130	case SO_PEEK_OFF:
1131		if (!sock->ops->set_peek_off)
1132			return -EOPNOTSUPP;
1133
1134		v.val = sk->sk_peek_off;
1135		break;
1136	case SO_NOFCS:
1137		v.val = sock_flag(sk, SOCK_NOFCS);
1138		break;
1139
1140	case SO_BINDTODEVICE:
1141		return sock_getbindtodevice(sk, optval, optlen, len);
1142
1143	case SO_GET_FILTER:
1144		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1145		if (len < 0)
1146			return len;
1147
1148		goto lenout;
1149
1150	case SO_LOCK_FILTER:
1151		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1152		break;
1153
1154	default:
1155		return -ENOPROTOOPT;
1156	}
1157
1158	if (len > lv)
1159		len = lv;
1160	if (copy_to_user(optval, &v, len))
1161		return -EFAULT;
1162lenout:
1163	if (put_user(len, optlen))
1164		return -EFAULT;
1165	return 0;
1166}
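
/*
 * Illustrative user-space sketch (not part of this file, shown as a comment):
 * SO_PEERCRED, filled in via cred_to_ucred() above, is commonly queried on
 * AF_UNIX sockets:
 *
 *	struct ucred peer;
 *	socklen_t len = sizeof(peer);
 *
 *	if (getsockopt(fd, SOL_SOCKET, SO_PEERCRED, &peer, &len) == 0)
 *		printf("peer pid=%d uid=%d gid=%d\n", peer.pid, peer.uid, peer.gid);
 */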
1167
1168/*
1169 * Initialize an sk_lock.
1170 *
1171 * (We also register the sk_lock with the lock validator.)
1172 */
1173static inline void sock_lock_init(struct sock *sk)
1174{
1175	sock_lock_init_class_and_name(sk,
1176			af_family_slock_key_strings[sk->sk_family],
1177			af_family_slock_keys + sk->sk_family,
1178			af_family_key_strings[sk->sk_family],
1179			af_family_keys + sk->sk_family);
1180}
1181
1182/*
1183 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1184 * even temporarily, because of RCU lookups. sk_node should also be left as is.
1185 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end.
1186 */
1187static void sock_copy(struct sock *nsk, const struct sock *osk)
1188{
1189#ifdef CONFIG_SECURITY_NETWORK
1190	void *sptr = nsk->sk_security;
1191#endif
1192	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1193
1194	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1195	       osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1196
1197#ifdef CONFIG_SECURITY_NETWORK
1198	nsk->sk_security = sptr;
1199	security_sk_clone(osk, nsk);
1200#endif
1201}
1202
1203/*
1204 * caches using SLAB_DESTROY_BY_RCU should leave the .next pointer of nulls nodes
1205 * unmodified. Special care is taken when initializing the object to zero.
1206 */
1207static inline void sk_prot_clear_nulls(struct sock *sk, int size)
1208{
1209	if (offsetof(struct sock, sk_node.next) != 0)
1210		memset(sk, 0, offsetof(struct sock, sk_node.next));
1211	memset(&sk->sk_node.pprev, 0,
1212	       size - offsetof(struct sock, sk_node.pprev));
1213}
1214
1215void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
1216{
1217	unsigned long nulls1, nulls2;
1218
1219	nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
1220	nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
1221	if (nulls1 > nulls2)
1222		swap(nulls1, nulls2);
1223
1224	if (nulls1 != 0)
1225		memset((char *)sk, 0, nulls1);
1226	memset((char *)sk + nulls1 + sizeof(void *), 0,
1227	       nulls2 - nulls1 - sizeof(void *));
1228	memset((char *)sk + nulls2 + sizeof(void *), 0,
1229	       size - nulls2 - sizeof(void *));
1230}
1231EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
1232
1233static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1234		int family)
1235{
1236	struct sock *sk;
1237	struct kmem_cache *slab;
1238
1239	slab = prot->slab;
1240	if (slab != NULL) {
1241		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1242		if (!sk)
1243			return sk;
1244		if (priority & __GFP_ZERO) {
1245			if (prot->clear_sk)
1246				prot->clear_sk(sk, prot->obj_size);
1247			else
1248				sk_prot_clear_nulls(sk, prot->obj_size);
1249		}
1250	} else
1251		sk = kmalloc(prot->obj_size, priority);
1252
1253	if (sk != NULL) {
1254		kmemcheck_annotate_bitfield(sk, flags);
1255
1256		if (security_sk_alloc(sk, family, priority))
1257			goto out_free;
1258
1259		if (!try_module_get(prot->owner))
1260			goto out_free_sec;
1261		sk_tx_queue_clear(sk);
1262	}
1263
1264	return sk;
1265
1266out_free_sec:
1267	security_sk_free(sk);
1268out_free:
1269	if (slab != NULL)
1270		kmem_cache_free(slab, sk);
1271	else
1272		kfree(sk);
1273	return NULL;
1274}
1275
1276static void sk_prot_free(struct proto *prot, struct sock *sk)
1277{
1278	struct kmem_cache *slab;
1279	struct module *owner;
1280
1281	owner = prot->owner;
1282	slab = prot->slab;
1283
1284	security_sk_free(sk);
1285	if (slab != NULL)
1286		kmem_cache_free(slab, sk);
1287	else
1288		kfree(sk);
1289	module_put(owner);
1290}
1291
1292#ifdef CONFIG_CGROUPS
1293#if IS_ENABLED(CONFIG_NET_CLS_CGROUP)
1294void sock_update_classid(struct sock *sk, struct task_struct *task)
1295{
1296	u32 classid;
1297
1298	classid = task_cls_classid(task);
1299	if (classid != sk->sk_classid)
1300		sk->sk_classid = classid;
1301}
1302EXPORT_SYMBOL(sock_update_classid);
1303#endif
1304
1305#if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
1306void sock_update_netprioidx(struct sock *sk, struct task_struct *task)
1307{
1308	if (in_interrupt())
1309		return;
1310
1311	sk->sk_cgrp_prioidx = task_netprioidx(task);
1312}
1313EXPORT_SYMBOL_GPL(sock_update_netprioidx);
1314#endif
1315#endif
1316
1317/**
1318 *	sk_alloc - All socket objects are allocated here
1319 *	@net: the applicable net namespace
1320 *	@family: protocol family
1321 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1322 *	@prot: struct proto associated with this new sock instance
1323 */
1324struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1325		      struct proto *prot)
1326{
1327	struct sock *sk;
1328
1329	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1330	if (sk) {
1331		sk->sk_family = family;
1332		/*
1333		 * See comment in struct sock definition to understand
1334		 * why we need sk_prot_creator -acme
1335		 */
1336		sk->sk_prot = sk->sk_prot_creator = prot;
1337		sock_lock_init(sk);
1338		sock_net_set(sk, get_net(net));
1339		atomic_set(&sk->sk_wmem_alloc, 1);
1340
1341		sock_update_classid(sk, current);
1342		sock_update_netprioidx(sk, current);
1343	}
1344
1345	return sk;
1346}
1347EXPORT_SYMBOL(sk_alloc);
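
/*
 * Illustrative sketch (not part of this file): a protocol family's ->create()
 * handler usually pairs sk_alloc() with sock_init_data(), roughly as below.
 * The helper name is hypothetical; error handling is reduced to the minimum.
 */
static struct sock *example_sock_create(struct net *net, struct socket *sock,
					int family, struct proto *prot)
{
	struct sock *sk = sk_alloc(net, family, GFP_KERNEL, prot);

	if (sk)
		sock_init_data(sock, sk);	/* queues, callbacks, buffers */
	return sk;
}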
1348
1349static void __sk_free(struct sock *sk)
1350{
1351	struct sk_filter *filter;
1352
1353	if (sk->sk_destruct)
1354		sk->sk_destruct(sk);
1355
1356	filter = rcu_dereference_check(sk->sk_filter,
1357				       atomic_read(&sk->sk_wmem_alloc) == 0);
1358	if (filter) {
1359		sk_filter_uncharge(sk, filter);
1360		RCU_INIT_POINTER(sk->sk_filter, NULL);
1361	}
1362
1363	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1364
1365	if (atomic_read(&sk->sk_omem_alloc))
1366		pr_debug("%s: optmem leakage (%d bytes) detected\n",
1367			 __func__, atomic_read(&sk->sk_omem_alloc));
1368
1369	if (sk->sk_peer_cred)
1370		put_cred(sk->sk_peer_cred);
1371	put_pid(sk->sk_peer_pid);
1372	put_net(sock_net(sk));
1373	sk_prot_free(sk->sk_prot_creator, sk);
1374}
1375
1376void sk_free(struct sock *sk)
1377{
1378	/*
1379	 * We subtract one from sk_wmem_alloc to learn whether
1380	 * some packets are still in some tx queue.
1381	 * If not zero, sock_wfree() will call __sk_free(sk) later.
1382	 */
1383	if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1384		__sk_free(sk);
1385}
1386EXPORT_SYMBOL(sk_free);
1387
1388/*
1389 * The last sock_put should drop the reference to sk->sk_net. It has already
1390 * been dropped in sk_change_net. Taking a reference to the stopping namespace
1391 * is not an option.
1392 * Take a reference to the socket to remove it from the hash while it is still
1393 * _alive_ and after that destroy it in the context of init_net.
1394 */
1395void sk_release_kernel(struct sock *sk)
1396{
1397	if (sk == NULL || sk->sk_socket == NULL)
1398		return;
1399
1400	sock_hold(sk);
1401	sock_release(sk->sk_socket);
1402	release_net(sock_net(sk));
1403	sock_net_set(sk, get_net(&init_net));
1404	sock_put(sk);
1405}
1406EXPORT_SYMBOL(sk_release_kernel);
1407
1408static void sk_update_clone(const struct sock *sk, struct sock *newsk)
1409{
1410	if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
1411		sock_update_memcg(newsk);
1412}
1413
1414/**
1415 *	sk_clone_lock - clone a socket, and lock its clone
1416 *	@sk: the socket to clone
1417 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1418 *
1419 *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1420 */
1421struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1422{
1423	struct sock *newsk;
1424
1425	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1426	if (newsk != NULL) {
1427		struct sk_filter *filter;
1428
1429		sock_copy(newsk, sk);
1430
1431		/* SANITY */
1432		get_net(sock_net(newsk));
1433		sk_node_init(&newsk->sk_node);
1434		sock_lock_init(newsk);
1435		bh_lock_sock(newsk);
1436		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
1437		newsk->sk_backlog.len = 0;
1438
1439		atomic_set(&newsk->sk_rmem_alloc, 0);
1440		/*
1441		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1442		 */
1443		atomic_set(&newsk->sk_wmem_alloc, 1);
1444		atomic_set(&newsk->sk_omem_alloc, 0);
1445		skb_queue_head_init(&newsk->sk_receive_queue);
1446		skb_queue_head_init(&newsk->sk_write_queue);
1447#ifdef CONFIG_NET_DMA
1448		skb_queue_head_init(&newsk->sk_async_wait_queue);
1449#endif
1450
1451		spin_lock_init(&newsk->sk_dst_lock);
1452		rwlock_init(&newsk->sk_callback_lock);
1453		lockdep_set_class_and_name(&newsk->sk_callback_lock,
1454				af_callback_keys + newsk->sk_family,
1455				af_family_clock_key_strings[newsk->sk_family]);
1456
1457		newsk->sk_dst_cache	= NULL;
1458		newsk->sk_wmem_queued	= 0;
1459		newsk->sk_forward_alloc = 0;
1460		newsk->sk_send_head	= NULL;
1461		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1462
1463		sock_reset_flag(newsk, SOCK_DONE);
1464		skb_queue_head_init(&newsk->sk_error_queue);
1465
1466		filter = rcu_dereference_protected(newsk->sk_filter, 1);
1467		if (filter != NULL)
1468			sk_filter_charge(newsk, filter);
1469
1470		if (unlikely(xfrm_sk_clone_policy(newsk))) {
1471			/* It is still a raw copy of the parent, so invalidate
1472			 * the destructor and do a plain sk_free() */
1473			newsk->sk_destruct = NULL;
1474			bh_unlock_sock(newsk);
1475			sk_free(newsk);
1476			newsk = NULL;
1477			goto out;
1478		}
1479
1480		newsk->sk_err	   = 0;
1481		newsk->sk_priority = 0;
1482		/*
1483		 * Before updating sk_refcnt, we must commit prior changes to memory
1484		 * (Documentation/RCU/rculist_nulls.txt for details)
1485		 */
1486		smp_wmb();
1487		atomic_set(&newsk->sk_refcnt, 2);
1488
1489		/*
1490		 * Increment the counter in the same struct proto as the master
1491		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1492		 * is the same as sk->sk_prot->socks, as this field was copied
1493		 * with memcpy).
1494		 *
1495		 * This _changes_ the previous behaviour, where
1496		 * tcp_create_openreq_child always was incrementing the
1497		 * equivalent to tcp_prot->socks (inet_sock_nr), so this have
1498		 * to be taken into account in all callers. -acme
1499		 */
1500		sk_refcnt_debug_inc(newsk);
1501		sk_set_socket(newsk, NULL);
1502		newsk->sk_wq = NULL;
1503
1504		sk_update_clone(sk, newsk);
1505
1506		if (newsk->sk_prot->sockets_allocated)
1507			sk_sockets_allocated_inc(newsk);
1508
1509		if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1510			net_enable_timestamp();
1511	}
1512out:
1513	return newsk;
1514}
1515EXPORT_SYMBOL_GPL(sk_clone_lock);
1516
1517void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1518{
1519	__sk_dst_set(sk, dst);
1520	sk->sk_route_caps = dst->dev->features;
1521	if (sk->sk_route_caps & NETIF_F_GSO)
1522		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1523	sk->sk_route_caps &= ~sk->sk_route_nocaps;
1524	if (sk_can_gso(sk)) {
1525		if (dst->header_len) {
1526			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1527		} else {
1528			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1529			sk->sk_gso_max_size = dst->dev->gso_max_size;
1530			sk->sk_gso_max_segs = dst->dev->gso_max_segs;
1531		}
1532	}
1533}
1534EXPORT_SYMBOL_GPL(sk_setup_caps);
1535
1536/*
1537 *	Simple resource managers for sockets.
1538 */
1539
1540
1541/*
1542 * Write buffer destructor automatically called from kfree_skb.
1543 */
1544void sock_wfree(struct sk_buff *skb)
1545{
1546	struct sock *sk = skb->sk;
1547	unsigned int len = skb->truesize;
1548
1549	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1550		/*
1551		 * Keep a reference on sk_wmem_alloc; it will be released
1552		 * after the sk_write_space() call
1553		 */
1554		atomic_sub(len - 1, &sk->sk_wmem_alloc);
1555		sk->sk_write_space(sk);
1556		len = 1;
1557	}
1558	/*
1559	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1560	 * could not do because of in-flight packets
1561	 */
1562	if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1563		__sk_free(sk);
1564}
1565EXPORT_SYMBOL(sock_wfree);
1566
1567/*
1568 * Read buffer destructor automatically called from kfree_skb.
1569 */
1570void sock_rfree(struct sk_buff *skb)
1571{
1572	struct sock *sk = skb->sk;
1573	unsigned int len = skb->truesize;
1574
1575	atomic_sub(len, &sk->sk_rmem_alloc);
1576	sk_mem_uncharge(sk, len);
1577}
1578EXPORT_SYMBOL(sock_rfree);
1579
1580void sock_edemux(struct sk_buff *skb)
1581{
1582	struct sock *sk = skb->sk;
1583
1584#ifdef CONFIG_INET
1585	if (sk->sk_state == TCP_TIME_WAIT)
1586		inet_twsk_put(inet_twsk(sk));
1587	else
1588#endif
1589		sock_put(sk);
1590}
1591EXPORT_SYMBOL(sock_edemux);
1592
1593kuid_t sock_i_uid(struct sock *sk)
1594{
1595	kuid_t uid;
1596
1597	read_lock_bh(&sk->sk_callback_lock);
1598	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1599	read_unlock_bh(&sk->sk_callback_lock);
1600	return uid;
1601}
1602EXPORT_SYMBOL(sock_i_uid);
1603
1604unsigned long sock_i_ino(struct sock *sk)
1605{
1606	unsigned long ino;
1607
1608	read_lock_bh(&sk->sk_callback_lock);
1609	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1610	read_unlock_bh(&sk->sk_callback_lock);
1611	return ino;
1612}
1613EXPORT_SYMBOL(sock_i_ino);
1614
1615/*
1616 * Allocate an skb from the socket's send buffer.
1617 */
1618struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1619			     gfp_t priority)
1620{
1621	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1622		struct sk_buff *skb = alloc_skb(size, priority);
1623		if (skb) {
1624			skb_set_owner_w(skb, sk);
1625			return skb;
1626		}
1627	}
1628	return NULL;
1629}
1630EXPORT_SYMBOL(sock_wmalloc);
1631
1632/*
1633 * Allocate an skb from the socket's receive buffer.
1634 */
1635struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
1636			     gfp_t priority)
1637{
1638	if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1639		struct sk_buff *skb = alloc_skb(size, priority);
1640		if (skb) {
1641			skb_set_owner_r(skb, sk);
1642			return skb;
1643		}
1644	}
1645	return NULL;
1646}
1647
1648/*
1649 * Allocate a memory block from the socket's option memory buffer.
1650 */
1651void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1652{
1653	if ((unsigned int)size <= sysctl_optmem_max &&
1654	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1655		void *mem;
1656		/* First do the add, to avoid the race if kmalloc
1657		 * might sleep.
1658		 */
1659		atomic_add(size, &sk->sk_omem_alloc);
1660		mem = kmalloc(size, priority);
1661		if (mem)
1662			return mem;
1663		atomic_sub(size, &sk->sk_omem_alloc);
1664	}
1665	return NULL;
1666}
1667EXPORT_SYMBOL(sock_kmalloc);
1668
1669/*
1670 * Free an option memory block.
1671 */
1672void sock_kfree_s(struct sock *sk, void *mem, int size)
1673{
1674	kfree(mem);
1675	atomic_sub(size, &sk->sk_omem_alloc);
1676}
1677EXPORT_SYMBOL(sock_kfree_s);
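
/*
 * Illustrative sketch (not part of this file): per-socket option state is
 * charged against the optmem quota and must be released with the same size.
 * The function name is hypothetical.
 */
static int example_attach_opt_state(struct sock *sk, int size)
{
	void *state = sock_kmalloc(sk, size, GFP_KERNEL);

	if (!state)
		return -ENOBUFS;
	/* ... use state ..., then release it against the same accounting */
	sock_kfree_s(sk, state, size);
	return 0;
}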
1678
1679/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1680   I think these locks should be removed for datagram sockets.
1681 */
1682static long sock_wait_for_wmem(struct sock *sk, long timeo)
1683{
1684	DEFINE_WAIT(wait);
1685
1686	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1687	for (;;) {
1688		if (!timeo)
1689			break;
1690		if (signal_pending(current))
1691			break;
1692		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1693		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1694		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1695			break;
1696		if (sk->sk_shutdown & SEND_SHUTDOWN)
1697			break;
1698		if (sk->sk_err)
1699			break;
1700		timeo = schedule_timeout(timeo);
1701	}
1702	finish_wait(sk_sleep(sk), &wait);
1703	return timeo;
1704}
1705
1706
1707/*
1708 *	Generic send/receive buffer handlers
1709 */
1710
1711struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1712				     unsigned long data_len, int noblock,
1713				     int *errcode)
1714{
1715	struct sk_buff *skb;
1716	gfp_t gfp_mask;
1717	long timeo;
1718	int err;
1719	int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1720
1721	err = -EMSGSIZE;
1722	if (npages > MAX_SKB_FRAGS)
1723		goto failure;
1724
1725	gfp_mask = sk->sk_allocation;
1726	if (gfp_mask & __GFP_WAIT)
1727		gfp_mask |= __GFP_REPEAT;
1728
1729	timeo = sock_sndtimeo(sk, noblock);
1730	while (1) {
1731		err = sock_error(sk);
1732		if (err != 0)
1733			goto failure;
1734
1735		err = -EPIPE;
1736		if (sk->sk_shutdown & SEND_SHUTDOWN)
1737			goto failure;
1738
1739		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1740			skb = alloc_skb(header_len, gfp_mask);
1741			if (skb) {
1742				int i;
1743
1744				/* No pages, we're done... */
1745				if (!data_len)
1746					break;
1747
1748				skb->truesize += data_len;
1749				skb_shinfo(skb)->nr_frags = npages;
1750				for (i = 0; i < npages; i++) {
1751					struct page *page;
1752
1753					page = alloc_pages(sk->sk_allocation, 0);
1754					if (!page) {
1755						err = -ENOBUFS;
1756						skb_shinfo(skb)->nr_frags = i;
1757						kfree_skb(skb);
1758						goto failure;
1759					}
1760
1761					__skb_fill_page_desc(skb, i,
1762							page, 0,
1763							(data_len >= PAGE_SIZE ?
1764							 PAGE_SIZE :
1765							 data_len));
1766					data_len -= PAGE_SIZE;
1767				}
1768
1769				/* Full success... */
1770				break;
1771			}
1772			err = -ENOBUFS;
1773			goto failure;
1774		}
1775		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1776		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1777		err = -EAGAIN;
1778		if (!timeo)
1779			goto failure;
1780		if (signal_pending(current))
1781			goto interrupted;
1782		timeo = sock_wait_for_wmem(sk, timeo);
1783	}
1784
1785	skb_set_owner_w(skb, sk);
1786	return skb;
1787
1788interrupted:
1789	err = sock_intr_errno(timeo);
1790failure:
1791	*errcode = err;
1792	return NULL;
1793}
1794EXPORT_SYMBOL(sock_alloc_send_pskb);
1795
1796struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1797				    int noblock, int *errcode)
1798{
1799	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
1800}
1801EXPORT_SYMBOL(sock_alloc_send_skb);
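
/*
 * Illustrative sketch (not part of this file): a datagram sendmsg()
 * implementation typically reserves protocol headroom and blocks on wmem via
 * sock_alloc_send_skb().  The helper name is hypothetical.
 */
static struct sk_buff *example_sendmsg_alloc(struct sock *sk, size_t len,
					     int noblock, int *err)
{
	struct sk_buff *skb;

	skb = sock_alloc_send_skb(sk, len + sk->sk_prot->max_header,
				  noblock, err);
	if (!skb)
		return NULL;	/* *err already set: -EAGAIN, -EPIPE, ... */
	skb_reserve(skb, sk->sk_prot->max_header);
	return skb;
}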
1802
1803/* On 32bit arches, an skb frag is limited to 2^15 */
1804#define SKB_FRAG_PAGE_ORDER	get_order(32768)
1805
1806bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
1807{
1808	int order;
1809
1810	if (pfrag->page) {
1811		if (atomic_read(&pfrag->page->_count) == 1) {
1812			pfrag->offset = 0;
1813			return true;
1814		}
1815		if (pfrag->offset < pfrag->size)
1816			return true;
1817		put_page(pfrag->page);
1818	}
1819
1820	/* We restrict high order allocations to users that can afford to wait */
1821	order = (sk->sk_allocation & __GFP_WAIT) ? SKB_FRAG_PAGE_ORDER : 0;
1822
1823	do {
1824		gfp_t gfp = sk->sk_allocation;
1825
1826		if (order)
1827			gfp |= __GFP_COMP | __GFP_NOWARN;
1828		pfrag->page = alloc_pages(gfp, order);
1829		if (likely(pfrag->page)) {
1830			pfrag->offset = 0;
1831			pfrag->size = PAGE_SIZE << order;
1832			return true;
1833		}
1834	} while (--order >= 0);
1835
1836	sk_enter_memory_pressure(sk);
1837	sk_stream_moderate_sndbuf(sk);
1838	return false;
1839}
1840EXPORT_SYMBOL(sk_page_frag_refill);
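
/*
 * Illustrative sketch (not part of this file): senders that copy user data
 * into page fragments combine sk_page_frag() with the refill helper above,
 * in the style of tcp_sendmsg().  The function name is hypothetical.
 */
static bool example_reserve_frag_space(struct sock *sk, int copy)
{
	struct page_frag *pfrag = sk_page_frag(sk);

	if (!sk_page_frag_refill(sk, pfrag))
		return false;	/* under memory pressure, caller should wait */

	return pfrag->size - pfrag->offset >= copy;
}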
1841
1842static void __lock_sock(struct sock *sk)
1843	__releases(&sk->sk_lock.slock)
1844	__acquires(&sk->sk_lock.slock)
1845{
1846	DEFINE_WAIT(wait);
1847
1848	for (;;) {
1849		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1850					TASK_UNINTERRUPTIBLE);
1851		spin_unlock_bh(&sk->sk_lock.slock);
1852		schedule();
1853		spin_lock_bh(&sk->sk_lock.slock);
1854		if (!sock_owned_by_user(sk))
1855			break;
1856	}
1857	finish_wait(&sk->sk_lock.wq, &wait);
1858}
1859
1860static void __release_sock(struct sock *sk)
1861	__releases(&sk->sk_lock.slock)
1862	__acquires(&sk->sk_lock.slock)
1863{
1864	struct sk_buff *skb = sk->sk_backlog.head;
1865
1866	do {
1867		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1868		bh_unlock_sock(sk);
1869
1870		do {
1871			struct sk_buff *next = skb->next;
1872
1873			prefetch(next);
1874			WARN_ON_ONCE(skb_dst_is_noref(skb));
1875			skb->next = NULL;
1876			sk_backlog_rcv(sk, skb);
1877
1878			/*
1879			 * We are in process context here with softirqs
1880			 * disabled, use cond_resched_softirq() to preempt.
1881			 * This is safe to do because we've taken the backlog
1882			 * queue private:
1883			 */
1884			cond_resched_softirq();
1885
1886			skb = next;
1887		} while (skb != NULL);
1888
1889		bh_lock_sock(sk);
1890	} while ((skb = sk->sk_backlog.head) != NULL);
1891
1892	/*
1893	 * Doing the zeroing here guarantees we cannot loop forever
1894	 * while a wild producer attempts to flood us.
1895	 */
1896	sk->sk_backlog.len = 0;
1897}
1898
1899/**
1900 * sk_wait_data - wait for data to arrive at sk_receive_queue
1901 * @sk:    sock to wait on
1902 * @timeo: for how long
1903 *
1904 * Now socket state including sk->sk_err is changed only under lock,
1905 * hence we may omit checks after joining wait queue.
1906 * We check receive queue before schedule() only as optimization;
1907 * it is very likely that release_sock() added new data.
1908 */
1909int sk_wait_data(struct sock *sk, long *timeo)
1910{
1911	int rc;
1912	DEFINE_WAIT(wait);
1913
1914	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1915	set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1916	rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1917	clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1918	finish_wait(sk_sleep(sk), &wait);
1919	return rc;
1920}
1921EXPORT_SYMBOL(sk_wait_data);
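
/*
 * Illustrative sketch (not part of this file): a blocking recvmsg()
 * implementation, called with the socket lock held, typically loops on
 * sk_wait_data() until a packet arrives or an error/timeout/signal ends the
 * wait.  The helper name is hypothetical and error handling is simplified.
 */
static struct sk_buff *example_wait_for_packet(struct sock *sk, int noblock,
					       int *err)
{
	long timeo = sock_rcvtimeo(sk, noblock);
	struct sk_buff *skb;

	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
		if (sk->sk_err) {
			*err = sock_error(sk);
			return NULL;
		}
		if (!timeo) {
			*err = -EAGAIN;
			return NULL;
		}
		if (signal_pending(current)) {
			*err = sock_intr_errno(timeo);
			return NULL;
		}
		sk_wait_data(sk, &timeo);
	}
	return skb;
}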
1922
1923/**
1924 *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
1925 *	@sk: socket
1926 *	@size: memory size to allocate
1927 *	@kind: allocation type
1928 *
1929 *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
1930 *	rmem allocation. This function assumes that protocols which have
1931 *	memory_pressure use sk_wmem_queued as write buffer accounting.
1932 */
1933int __sk_mem_schedule(struct sock *sk, int size, int kind)
1934{
1935	struct proto *prot = sk->sk_prot;
1936	int amt = sk_mem_pages(size);
1937	long allocated;
1938	int parent_status = UNDER_LIMIT;
1939
1940	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
1941
1942	allocated = sk_memory_allocated_add(sk, amt, &parent_status);
1943
1944	/* Under limit. */
1945	if (parent_status == UNDER_LIMIT &&
1946			allocated <= sk_prot_mem_limits(sk, 0)) {
1947		sk_leave_memory_pressure(sk);
1948		return 1;
1949	}
1950
1951	/* Under pressure. (we or our parents) */
1952	if ((parent_status > SOFT_LIMIT) ||
1953			allocated > sk_prot_mem_limits(sk, 1))
1954		sk_enter_memory_pressure(sk);
1955
1956	/* Over hard limit (we or our parents) */
1957	if ((parent_status == OVER_LIMIT) ||
1958			(allocated > sk_prot_mem_limits(sk, 2)))
1959		goto suppress_allocation;
1960
1961	/* guarantee minimum buffer size under pressure */
1962	if (kind == SK_MEM_RECV) {
1963		if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
1964			return 1;
1965
1966	} else { /* SK_MEM_SEND */
1967		if (sk->sk_type == SOCK_STREAM) {
1968			if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
1969				return 1;
1970		} else if (atomic_read(&sk->sk_wmem_alloc) <
1971			   prot->sysctl_wmem[0])
1972				return 1;
1973	}
1974
1975	if (sk_has_memory_pressure(sk)) {
1976		int alloc;
1977
1978		if (!sk_under_memory_pressure(sk))
1979			return 1;
1980		alloc = sk_sockets_allocated_read_positive(sk);
1981		if (sk_prot_mem_limits(sk, 2) > alloc *
1982		    sk_mem_pages(sk->sk_wmem_queued +
1983				 atomic_read(&sk->sk_rmem_alloc) +
1984				 sk->sk_forward_alloc))
1985			return 1;
1986	}
1987
1988suppress_allocation:
1989
1990	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
1991		sk_stream_moderate_sndbuf(sk);
1992
1993		/* Fail only if socket is _under_ its sndbuf.
1994		 * In this case we cannot block, so we have to fail.
1995		 */
1996		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
1997			return 1;
1998	}
1999
2000	trace_sock_exceed_buf_limit(sk, prot, allocated);
2001
2002	/* Alas. Undo changes. */
2003	sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
2004
2005	sk_memory_allocated_sub(sk, amt);
2006
2007	return 0;
2008}
2009EXPORT_SYMBOL(__sk_mem_schedule);
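
/*
 * Note: protocols do not usually call __sk_mem_schedule() directly; the
 * sk_wmem_schedule()/sk_rmem_schedule() helpers in include/net/sock.h are
 * expected to check sk_forward_alloc first and only fall back to this
 * slow path when more quanta are needed.
 */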
2010
2011/**
2012 *	__sk_mem_reclaim - reclaim memory_allocated
2013 *	@sk: socket
2014 */
2015void __sk_mem_reclaim(struct sock *sk)
2016{
2017	sk_memory_allocated_sub(sk,
2018				sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT);
2019	sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
2020
2021	if (sk_under_memory_pressure(sk) &&
2022	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2023		sk_leave_memory_pressure(sk);
2024}
2025EXPORT_SYMBOL(__sk_mem_reclaim);
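
/*
 * Note: the usual entry point is the sk_mem_reclaim() helper in
 * include/net/sock.h, which is expected to call __sk_mem_reclaim() only
 * once sk_forward_alloc has accumulated at least one SK_MEM_QUANTUM.
 */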
2026
2027
2028/*
2029 * Set of default routines for initialising struct proto_ops when
2030 * the protocol does not support a particular function. In certain
2031 * cases where it makes no sense for a protocol to have a "do nothing"
2032 * function, some default processing is provided.
2033 */
2034
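/*
 * Illustrative sketch only (PF_EXAMPLE and the example_* helpers are
 * hypothetical): a protocol can plug these stubs into its struct proto_ops
 * for the operations it does not implement, e.g.:
 *
 *	static const struct proto_ops example_dgram_ops = {
 *		.family     = PF_EXAMPLE,
 *		.owner      = THIS_MODULE,
 *		.release    = example_release,
 *		.bind       = example_bind,
 *		.connect    = sock_no_connect,
 *		.socketpair = sock_no_socketpair,
 *		.accept     = sock_no_accept,
 *		.listen     = sock_no_listen,
 *		.shutdown   = sock_no_shutdown,
 *		.mmap       = sock_no_mmap,
 *		.sendpage   = sock_no_sendpage,
 *	};
 */
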
2035int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2036{
2037	return -EOPNOTSUPP;
2038}
2039EXPORT_SYMBOL(sock_no_bind);
2040
2041int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2042		    int len, int flags)
2043{
2044	return -EOPNOTSUPP;
2045}
2046EXPORT_SYMBOL(sock_no_connect);
2047
2048int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2049{
2050	return -EOPNOTSUPP;
2051}
2052EXPORT_SYMBOL(sock_no_socketpair);
2053
2054int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
2055{
2056	return -EOPNOTSUPP;
2057}
2058EXPORT_SYMBOL(sock_no_accept);
2059
2060int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2061		    int *len, int peer)
2062{
2063	return -EOPNOTSUPP;
2064}
2065EXPORT_SYMBOL(sock_no_getname);
2066
2067unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
2068{
2069	return 0;
2070}
2071EXPORT_SYMBOL(sock_no_poll);
2072
2073int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2074{
2075	return -EOPNOTSUPP;
2076}
2077EXPORT_SYMBOL(sock_no_ioctl);
2078
2079int sock_no_listen(struct socket *sock, int backlog)
2080{
2081	return -EOPNOTSUPP;
2082}
2083EXPORT_SYMBOL(sock_no_listen);
2084
2085int sock_no_shutdown(struct socket *sock, int how)
2086{
2087	return -EOPNOTSUPP;
2088}
2089EXPORT_SYMBOL(sock_no_shutdown);
2090
2091int sock_no_setsockopt(struct socket *sock, int level, int optname,
2092		    char __user *optval, unsigned int optlen)
2093{
2094	return -EOPNOTSUPP;
2095}
2096EXPORT_SYMBOL(sock_no_setsockopt);
2097
2098int sock_no_getsockopt(struct socket *sock, int level, int optname,
2099		    char __user *optval, int __user *optlen)
2100{
2101	return -EOPNOTSUPP;
2102}
2103EXPORT_SYMBOL(sock_no_getsockopt);
2104
2105int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
2106		    size_t len)
2107{
2108	return -EOPNOTSUPP;
2109}
2110EXPORT_SYMBOL(sock_no_sendmsg);
2111
2112int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
2113		    size_t len, int flags)
2114{
2115	return -EOPNOTSUPP;
2116}
2117EXPORT_SYMBOL(sock_no_recvmsg);
2118
2119int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2120{
2121	/* Mirror missing mmap method error code */
2122	return -ENODEV;
2123}
2124EXPORT_SYMBOL(sock_no_mmap);
2125
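/*
 * Generic sendpage fallback: map the page and push it through the
 * protocol's sendmsg path via kernel_sendmsg(). No zero-copy here, but it
 * lets protocols without a native sendpage still support the interface.
 */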
2126ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2127{
2128	ssize_t res;
2129	struct msghdr msg = {.msg_flags = flags};
2130	struct kvec iov;
2131	char *kaddr = kmap(page);
2132	iov.iov_base = kaddr + offset;
2133	iov.iov_len = size;
2134	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2135	kunmap(page);
2136	return res;
2137}
2138EXPORT_SYMBOL(sock_no_sendpage);
2139
2140/*
2141 *	Default Socket Callbacks
2142 */
2143
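/*
 * These callbacks can run concurrently with socket teardown, so sk->sk_wq
 * is accessed under rcu_read_lock() and wq_has_sleeper() is used to avoid
 * a wake-up (and its wait-queue lock) when nobody is sleeping.
 */
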
2144static void sock_def_wakeup(struct sock *sk)
2145{
2146	struct socket_wq *wq;
2147
2148	rcu_read_lock();
2149	wq = rcu_dereference(sk->sk_wq);
2150	if (wq_has_sleeper(wq))
2151		wake_up_interruptible_all(&wq->wait);
2152	rcu_read_unlock();
2153}
2154
2155static void sock_def_error_report(struct sock *sk)
2156{
2157	struct socket_wq *wq;
2158
2159	rcu_read_lock();
2160	wq = rcu_dereference(sk->sk_wq);
2161	if (wq_has_sleeper(wq))
2162		wake_up_interruptible_poll(&wq->wait, POLLERR);
2163	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2164	rcu_read_unlock();
2165}
2166
2167static void sock_def_readable(struct sock *sk, int len)
2168{
2169	struct socket_wq *wq;
2170
2171	rcu_read_lock();
2172	wq = rcu_dereference(sk->sk_wq);
2173	if (wq_has_sleeper(wq))
2174		wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
2175						POLLRDNORM | POLLRDBAND);
2176	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2177	rcu_read_unlock();
2178}
2179
2180static void sock_def_write_space(struct sock *sk)
2181{
2182	struct socket_wq *wq;
2183
2184	rcu_read_lock();
2185
2186	/* Do not wake up a writer until he can make "significant"
2187	 * progress.  --DaveM
2188	 */
2189	if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2190		wq = rcu_dereference(sk->sk_wq);
2191		if (wq_has_sleeper(wq))
2192			wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
2193						POLLWRNORM | POLLWRBAND);
2194
2195		/* Should agree with poll, otherwise some programs break */
2196		if (sock_writeable(sk))
2197			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2198	}
2199
2200	rcu_read_unlock();
2201}
2202
2203static void sock_def_destruct(struct sock *sk)
2204{
2205	kfree(sk->sk_protinfo);
2206}
2207
2208void sk_send_sigurg(struct sock *sk)
2209{
2210	if (sk->sk_socket && sk->sk_socket->file)
2211		if (send_sigurg(&sk->sk_socket->file->f_owner))
2212			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2213}
2214EXPORT_SYMBOL(sk_send_sigurg);
2215
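/*
 * sk_reset_timer()/sk_stop_timer() keep the socket refcount in sync with
 * a pending timer: arming a previously idle timer takes a reference, and
 * deleting a pending timer drops it, so the sock cannot be freed while
 * its timer is still queued.
 */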
2216void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2217		    unsigned long expires)
2218{
2219	if (!mod_timer(timer, expires))
2220		sock_hold(sk);
2221}
2222EXPORT_SYMBOL(sk_reset_timer);
2223
2224void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2225{
2226	if (timer_pending(timer) && del_timer(timer))
2227		__sock_put(sk);
2228}
2229EXPORT_SYMBOL(sk_stop_timer);
2230
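/*
 * Initialise a freshly allocated struct sock with sane defaults: empty
 * queues, default buffer sizes from sysctl, infinite timeouts and the
 * sock_def_* callbacks above. Typically called by protocol families after
 * sk_alloc().
 */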
2231void sock_init_data(struct socket *sock, struct sock *sk)
2232{
2233	skb_queue_head_init(&sk->sk_receive_queue);
2234	skb_queue_head_init(&sk->sk_write_queue);
2235	skb_queue_head_init(&sk->sk_error_queue);
2236#ifdef CONFIG_NET_DMA
2237	skb_queue_head_init(&sk->sk_async_wait_queue);
2238#endif
2239
2240	sk->sk_send_head	=	NULL;
2241
2242	init_timer(&sk->sk_timer);
2243
2244	sk->sk_allocation	=	GFP_KERNEL;
2245	sk->sk_rcvbuf		=	sysctl_rmem_default;
2246	sk->sk_sndbuf		=	sysctl_wmem_default;
2247	sk->sk_state		=	TCP_CLOSE;
2248	sk_set_socket(sk, sock);
2249
2250	sock_set_flag(sk, SOCK_ZAPPED);
2251
2252	if (sock) {
2253		sk->sk_type	=	sock->type;
2254		sk->sk_wq	=	sock->wq;
2255		sock->sk	=	sk;
2256	} else
2257		sk->sk_wq	=	NULL;
2258
2259	spin_lock_init(&sk->sk_dst_lock);
2260	rwlock_init(&sk->sk_callback_lock);
2261	lockdep_set_class_and_name(&sk->sk_callback_lock,
2262			af_callback_keys + sk->sk_family,
2263			af_family_clock_key_strings[sk->sk_family]);
2264
2265	sk->sk_state_change	=	sock_def_wakeup;
2266	sk->sk_data_ready	=	sock_def_readable;
2267	sk->sk_write_space	=	sock_def_write_space;
2268	sk->sk_error_report	=	sock_def_error_report;
2269	sk->sk_destruct		=	sock_def_destruct;
2270
2271	sk->sk_frag.page	=	NULL;
2272	sk->sk_frag.offset	=	0;
2273	sk->sk_peek_off		=	-1;
2274
2275	sk->sk_peer_pid 	=	NULL;
2276	sk->sk_peer_cred	=	NULL;
2277	sk->sk_write_pending	=	0;
2278	sk->sk_rcvlowat		=	1;
2279	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
2280	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
2281
2282	sk->sk_stamp = ktime_set(-1L, 0);
2283
2284	/*
2285	 * Before updating sk_refcnt, we must commit prior changes to memory
2286	 * (see Documentation/RCU/rculist_nulls.txt for details)
2287	 */
2288	smp_wmb();
2289	atomic_set(&sk->sk_refcnt, 1);
2290	atomic_set(&sk->sk_drops, 0);
2291}
2292EXPORT_SYMBOL(sock_init_data);
2293
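/*
 * Take ownership of the socket: if a user context already owns it, sleep
 * in __lock_sock() until it is released. On return the "owned" flag is
 * set, the spinlock is dropped and BHs are re-enabled, so the lock behaves
 * like a mutex for the process-context caller.
 */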
2294void lock_sock_nested(struct sock *sk, int subclass)
2295{
2296	might_sleep();
2297	spin_lock_bh(&sk->sk_lock.slock);
2298	if (sk->sk_lock.owned)
2299		__lock_sock(sk);
2300	sk->sk_lock.owned = 1;
2301	spin_unlock(&sk->sk_lock.slock);
2302	/*
2303	 * The sk_lock has mutex_lock() semantics here:
2304	 */
2305	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2306	local_bh_enable();
2307}
2308EXPORT_SYMBOL(lock_sock_nested);
2309
2310void release_sock(struct sock *sk)
2311{
2312	/*
2313	 * The sk_lock has mutex_unlock() semantics:
2314	 */
2315	mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
2316
2317	spin_lock_bh(&sk->sk_lock.slock);
2318	if (sk->sk_backlog.tail)
2319		__release_sock(sk);
2320
2321	if (sk->sk_prot->release_cb)
2322		sk->sk_prot->release_cb(sk);
2323
2324	sk->sk_lock.owned = 0;
2325	if (waitqueue_active(&sk->sk_lock.wq))
2326		wake_up(&sk->sk_lock.wq);
2327	spin_unlock_bh(&sk->sk_lock.slock);
2328}
2329EXPORT_SYMBOL(release_sock);
2330
2331/**
2332 * lock_sock_fast - fast version of lock_sock
2333 * @sk: socket
2334 *
2335 * This version should be used for very small sections, where the process
2336 * won't block. Returns false if the fast path is taken:
2337 *   sk_lock.slock locked, owned = 0, BH disabled
2338 * Returns true if the slow path is taken:
2339 *   sk_lock.slock unlocked, owned = 1, BH enabled
2340 */
2341bool lock_sock_fast(struct sock *sk)
2342{
2343	might_sleep();
2344	spin_lock_bh(&sk->sk_lock.slock);
2345
2346	if (!sk->sk_lock.owned)
2347		/*
2348		 * Fast path: return with BH disabled and slock held.
2349		 */
2350		return false;
2351
2352	__lock_sock(sk);
2353	sk->sk_lock.owned = 1;
2354	spin_unlock(&sk->sk_lock.slock);
2355	/*
2356	 * The sk_lock has mutex_lock() semantics here:
2357	 */
2358	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2359	local_bh_enable();
2360	return true;
2361}
2362EXPORT_SYMBOL(lock_sock_fast);
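
/*
 * Illustrative pairing (a sketch, not mandated by this file): callers are
 * expected to pass the returned value to unlock_sock_fast(), defined in
 * include/net/sock.h, e.g.:
 *
 *	bool slow = lock_sock_fast(sk);
 *	... short critical section ...
 *	unlock_sock_fast(sk, slow);
 */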
2363
2364int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2365{
2366	struct timeval tv;
2367	if (!sock_flag(sk, SOCK_TIMESTAMP))
2368		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2369	tv = ktime_to_timeval(sk->sk_stamp);
2370	if (tv.tv_sec == -1)
2371		return -ENOENT;
2372	if (tv.tv_sec == 0) {
2373		sk->sk_stamp = ktime_get_real();
2374		tv = ktime_to_timeval(sk->sk_stamp);
2375	}
2376	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2377}
2378EXPORT_SYMBOL(sock_get_timestamp);
2379
2380int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2381{
2382	struct timespec ts;
2383	if (!sock_flag(sk, SOCK_TIMESTAMP))
2384		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2385	ts = ktime_to_timespec(sk->sk_stamp);
2386	if (ts.tv_sec == -1)
2387		return -ENOENT;
2388	if (ts.tv_sec == 0) {
2389		sk->sk_stamp = ktime_get_real();
2390		ts = ktime_to_timespec(sk->sk_stamp);
2391	}
2392	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2393}
2394EXPORT_SYMBOL(sock_get_timestampns);
2395
2396void sock_enable_timestamp(struct sock *sk, int flag)
2397{
2398	if (!sock_flag(sk, flag)) {
2399		unsigned long previous_flags = sk->sk_flags;
2400
2401		sock_set_flag(sk, flag);
2402		/*
2403		 * we just set one of the two flags which require net
2404		 * time stamping, but time stamping might have been on
2405		 * already because of the other one
2406		 */
2407		if (!(previous_flags & SK_FLAGS_TIMESTAMP))
2408			net_enable_timestamp();
2409	}
2410}
2411
2412/*
2413 *	Get a socket option on a socket.
2414 *
2415 *	FIX: POSIX 1003.1g is very ambiguous here. It states that
2416 *	asynchronous errors should be reported by getsockopt. We assume
2417 *	this means if you specify SO_ERROR (otherwise what's the point of it).
2418 */
2419int sock_common_getsockopt(struct socket *sock, int level, int optname,
2420			   char __user *optval, int __user *optlen)
2421{
2422	struct sock *sk = sock->sk;
2423
2424	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2425}
2426EXPORT_SYMBOL(sock_common_getsockopt);
2427
2428#ifdef CONFIG_COMPAT
2429int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2430				  char __user *optval, int __user *optlen)
2431{
2432	struct sock *sk = sock->sk;
2433
2434	if (sk->sk_prot->compat_getsockopt != NULL)
2435		return sk->sk_prot->compat_getsockopt(sk, level, optname,
2436						      optval, optlen);
2437	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2438}
2439EXPORT_SYMBOL(compat_sock_common_getsockopt);
2440#endif
2441
2442int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
2443			struct msghdr *msg, size_t size, int flags)
2444{
2445	struct sock *sk = sock->sk;
2446	int addr_len = 0;
2447	int err;
2448
2449	err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
2450				   flags & ~MSG_DONTWAIT, &addr_len);
2451	if (err >= 0)
2452		msg->msg_namelen = addr_len;
2453	return err;
2454}
2455EXPORT_SYMBOL(sock_common_recvmsg);
2456
2457/*
2458 *	Set socket options on an inet socket.
2459 */
2460int sock_common_setsockopt(struct socket *sock, int level, int optname,
2461			   char __user *optval, unsigned int optlen)
2462{
2463	struct sock *sk = sock->sk;
2464
2465	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2466}
2467EXPORT_SYMBOL(sock_common_setsockopt);
2468
2469#ifdef CONFIG_COMPAT
2470int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2471				  char __user *optval, unsigned int optlen)
2472{
2473	struct sock *sk = sock->sk;
2474
2475	if (sk->sk_prot->compat_setsockopt != NULL)
2476		return sk->sk_prot->compat_setsockopt(sk, level, optname,
2477						      optval, optlen);
2478	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2479}
2480EXPORT_SYMBOL(compat_sock_common_setsockopt);
2481#endif
2482
2483void sk_common_release(struct sock *sk)
2484{
2485	if (sk->sk_prot->destroy)
2486		sk->sk_prot->destroy(sk);
2487
2488	/*
2489	 * Observation: when sk_common_release is called, processes have
2490	 * no access to the socket, but the network stack still does.
2491	 * Step one, detach it from networking:
2492	 *
2493	 * A. Remove from hash tables.
2494	 */
2495
2496	sk->sk_prot->unhash(sk);
2497
2498	/*
2499	 * At this point the socket cannot receive new packets, but it is possible
2500	 * that some packets are still in flight because some CPU ran the receiver
2501	 * and did the hash table lookup before we unhashed the socket. They will
2502	 * reach the receive queue and be purged by the socket destructor.
2503	 *
2504	 * Also, we still have packets pending on the receive queue and probably
2505	 * our own packets waiting in device queues. sock_destroy will drain the
2506	 * receive queue, but transmitted packets will delay socket destruction
2507	 * until the last reference is released.
2508	 */
2509
2510	sock_orphan(sk);
2511
2512	xfrm_sk_free_policy(sk);
2513
2514	sk_refcnt_debug_release(sk);
2515
2516	if (sk->sk_frag.page) {
2517		put_page(sk->sk_frag.page);
2518		sk->sk_frag.page = NULL;
2519	}
2520
2521	sock_put(sk);
2522}
2523EXPORT_SYMBOL(sk_common_release);
2524
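/*
 * Per-protocol "sockets in use" accounting below feeds the "sockets"
 * column of /proc/net/protocols. Counters are kept per-CPU and summed on
 * read; without CONFIG_NET_NS a single global per-CPU array is used.
 */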
2525#ifdef CONFIG_PROC_FS
2526#define PROTO_INUSE_NR	64	/* should be enough for now */
2527struct prot_inuse {
2528	int val[PROTO_INUSE_NR];
2529};
2530
2531static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2532
2533#ifdef CONFIG_NET_NS
2534void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2535{
2536	__this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
2537}
2538EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2539
2540int sock_prot_inuse_get(struct net *net, struct proto *prot)
2541{
2542	int cpu, idx = prot->inuse_idx;
2543	int res = 0;
2544
2545	for_each_possible_cpu(cpu)
2546		res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2547
2548	return res >= 0 ? res : 0;
2549}
2550EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2551
2552static int __net_init sock_inuse_init_net(struct net *net)
2553{
2554	net->core.inuse = alloc_percpu(struct prot_inuse);
2555	return net->core.inuse ? 0 : -ENOMEM;
2556}
2557
2558static void __net_exit sock_inuse_exit_net(struct net *net)
2559{
2560	free_percpu(net->core.inuse);
2561}
2562
2563static struct pernet_operations net_inuse_ops = {
2564	.init = sock_inuse_init_net,
2565	.exit = sock_inuse_exit_net,
2566};
2567
2568static __init int net_inuse_init(void)
2569{
2570	if (register_pernet_subsys(&net_inuse_ops))
2571		panic("Cannot initialize net inuse counters");
2572
2573	return 0;
2574}
2575
2576core_initcall(net_inuse_init);
2577#else
2578static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2579
2580void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2581{
2582	__this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
2583}
2584EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2585
2586int sock_prot_inuse_get(struct net *net, struct proto *prot)
2587{
2588	int cpu, idx = prot->inuse_idx;
2589	int res = 0;
2590
2591	for_each_possible_cpu(cpu)
2592		res += per_cpu(prot_inuse, cpu).val[idx];
2593
2594	return res >= 0 ? res : 0;
2595}
2596EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2597#endif
2598
2599static void assign_proto_idx(struct proto *prot)
2600{
2601	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2602
2603	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2604		pr_err("PROTO_INUSE_NR exhausted\n");
2605		return;
2606	}
2607
2608	set_bit(prot->inuse_idx, proto_inuse_idx);
2609}
2610
2611static void release_proto_idx(struct proto *prot)
2612{
2613	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2614		clear_bit(prot->inuse_idx, proto_inuse_idx);
2615}
2616#else
2617static inline void assign_proto_idx(struct proto *prot)
2618{
2619}
2620
2621static inline void release_proto_idx(struct proto *prot)
2622{
2623}
2624#endif
2625
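/*
 * Register a protocol with the core: optionally create its slab caches
 * (including request_sock and timewait_sock caches when the protocol
 * provides them), add it to proto_list and reserve a /proc inuse slot.
 * Returns 0 on success or -ENOBUFS if a cache could not be created.
 */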
2626int proto_register(struct proto *prot, int alloc_slab)
2627{
2628	if (alloc_slab) {
2629		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2630					SLAB_HWCACHE_ALIGN | prot->slab_flags,
2631					NULL);
2632
2633		if (prot->slab == NULL) {
2634			pr_crit("%s: Can't create sock SLAB cache!\n",
2635				prot->name);
2636			goto out;
2637		}
2638
2639		if (prot->rsk_prot != NULL) {
2640			prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name);
2641			if (prot->rsk_prot->slab_name == NULL)
2642				goto out_free_sock_slab;
2643
2644			prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
2645								 prot->rsk_prot->obj_size, 0,
2646								 SLAB_HWCACHE_ALIGN, NULL);
2647
2648			if (prot->rsk_prot->slab == NULL) {
2649				pr_crit("%s: Can't create request sock SLAB cache!\n",
2650					prot->name);
2651				goto out_free_request_sock_slab_name;
2652			}
2653		}
2654
2655		if (prot->twsk_prot != NULL) {
2656			prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
2657
2658			if (prot->twsk_prot->twsk_slab_name == NULL)
2659				goto out_free_request_sock_slab;
2660
2661			prot->twsk_prot->twsk_slab =
2662				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2663						  prot->twsk_prot->twsk_obj_size,
2664						  0,
2665						  SLAB_HWCACHE_ALIGN |
2666							prot->slab_flags,
2667						  NULL);
2668			if (prot->twsk_prot->twsk_slab == NULL)
2669				goto out_free_timewait_sock_slab_name;
2670		}
2671	}
2672
2673	mutex_lock(&proto_list_mutex);
2674	list_add(&prot->node, &proto_list);
2675	assign_proto_idx(prot);
2676	mutex_unlock(&proto_list_mutex);
2677	return 0;
2678
2679out_free_timewait_sock_slab_name:
2680	kfree(prot->twsk_prot->twsk_slab_name);
2681out_free_request_sock_slab:
2682	if (prot->rsk_prot && prot->rsk_prot->slab) {
2683		kmem_cache_destroy(prot->rsk_prot->slab);
2684		prot->rsk_prot->slab = NULL;
2685	}
2686out_free_request_sock_slab_name:
2687	if (prot->rsk_prot)
2688		kfree(prot->rsk_prot->slab_name);
2689out_free_sock_slab:
2690	kmem_cache_destroy(prot->slab);
2691	prot->slab = NULL;
2692out:
2693	return -ENOBUFS;
2694}
2695EXPORT_SYMBOL(proto_register);
2696
2697void proto_unregister(struct proto *prot)
2698{
2699	mutex_lock(&proto_list_mutex);
2700	release_proto_idx(prot);
2701	list_del(&prot->node);
2702	mutex_unlock(&proto_list_mutex);
2703
2704	if (prot->slab != NULL) {
2705		kmem_cache_destroy(prot->slab);
2706		prot->slab = NULL;
2707	}
2708
2709	if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
2710		kmem_cache_destroy(prot->rsk_prot->slab);
2711		kfree(prot->rsk_prot->slab_name);
2712		prot->rsk_prot->slab = NULL;
2713	}
2714
2715	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2716		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2717		kfree(prot->twsk_prot->twsk_slab_name);
2718		prot->twsk_prot->twsk_slab = NULL;
2719	}
2720}
2721EXPORT_SYMBOL(proto_unregister);
2722
2723#ifdef CONFIG_PROC_FS
2724static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2725	__acquires(proto_list_mutex)
2726{
2727	mutex_lock(&proto_list_mutex);
2728	return seq_list_start_head(&proto_list, *pos);
2729}
2730
2731static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2732{
2733	return seq_list_next(v, &proto_list, pos);
2734}
2735
2736static void proto_seq_stop(struct seq_file *seq, void *v)
2737	__releases(proto_list_mutex)
2738{
2739	mutex_unlock(&proto_list_mutex);
2740}
2741
2742static char proto_method_implemented(const void *method)
2743{
2744	return method == NULL ? 'n' : 'y';
2745}
2746static long sock_prot_memory_allocated(struct proto *proto)
2747{
2748	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
2749}
2750
2751static char *sock_prot_memory_pressure(struct proto *proto)
2752{
2753	return proto->memory_pressure != NULL ?
2754	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
2755}
2756
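/*
 * One line of /proc/net/protocols per registered protocol: object size,
 * sockets in use, memory allocated, memory-pressure state, max header
 * size, whether a slab cache is used, the owning module, then a 'y'/'n'
 * flag for each struct proto method the protocol implements.
 */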
2757static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2758{
2759
2760	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
2761			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2762		   proto->name,
2763		   proto->obj_size,
2764		   sock_prot_inuse_get(seq_file_net(seq), proto),
2765		   sock_prot_memory_allocated(proto),
2766		   sock_prot_memory_pressure(proto),
2767		   proto->max_header,
2768		   proto->slab == NULL ? "no" : "yes",
2769		   module_name(proto->owner),
2770		   proto_method_implemented(proto->close),
2771		   proto_method_implemented(proto->connect),
2772		   proto_method_implemented(proto->disconnect),
2773		   proto_method_implemented(proto->accept),
2774		   proto_method_implemented(proto->ioctl),
2775		   proto_method_implemented(proto->init),
2776		   proto_method_implemented(proto->destroy),
2777		   proto_method_implemented(proto->shutdown),
2778		   proto_method_implemented(proto->setsockopt),
2779		   proto_method_implemented(proto->getsockopt),
2780		   proto_method_implemented(proto->sendmsg),
2781		   proto_method_implemented(proto->recvmsg),
2782		   proto_method_implemented(proto->sendpage),
2783		   proto_method_implemented(proto->bind),
2784		   proto_method_implemented(proto->backlog_rcv),
2785		   proto_method_implemented(proto->hash),
2786		   proto_method_implemented(proto->unhash),
2787		   proto_method_implemented(proto->get_port),
2788		   proto_method_implemented(proto->enter_memory_pressure));
2789}
2790
2791static int proto_seq_show(struct seq_file *seq, void *v)
2792{
2793	if (v == &proto_list)
2794		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2795			   "protocol",
2796			   "size",
2797			   "sockets",
2798			   "memory",
2799			   "press",
2800			   "maxhdr",
2801			   "slab",
2802			   "module",
2803			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2804	else
2805		proto_seq_printf(seq, list_entry(v, struct proto, node));
2806	return 0;
2807}
2808
2809static const struct seq_operations proto_seq_ops = {
2810	.start  = proto_seq_start,
2811	.next   = proto_seq_next,
2812	.stop   = proto_seq_stop,
2813	.show   = proto_seq_show,
2814};
2815
2816static int proto_seq_open(struct inode *inode, struct file *file)
2817{
2818	return seq_open_net(inode, file, &proto_seq_ops,
2819			    sizeof(struct seq_net_private));
2820}
2821
2822static const struct file_operations proto_seq_fops = {
2823	.owner		= THIS_MODULE,
2824	.open		= proto_seq_open,
2825	.read		= seq_read,
2826	.llseek		= seq_lseek,
2827	.release	= seq_release_net,
2828};
2829
2830static __net_init int proto_init_net(struct net *net)
2831{
2832	if (!proc_net_fops_create(net, "protocols", S_IRUGO, &proto_seq_fops))
2833		return -ENOMEM;
2834
2835	return 0;
2836}
2837
2838static __net_exit void proto_exit_net(struct net *net)
2839{
2840	proc_net_remove(net, "protocols");
2841}
2842
2843
2844static __net_initdata struct pernet_operations proto_net_ops = {
2845	.init = proto_init_net,
2846	.exit = proto_exit_net,
2847};
2848
2849static int __init proto_init(void)
2850{
2851	return register_pernet_subsys(&proto_net_ops);
2852}
2853
2854subsys_initcall(proto_init);
2855
2856#endif /* PROC_FS */
2857