1/*
2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
3 *		operating system.  INET is implemented using the  BSD Socket
4 *		interface as the means of communication with the user level.
5 *
6 *		Generic socket support routines. Memory allocators, socket lock/release
7 *		handler for protocols to use and generic option handler.
8 *
9 *
10 * Authors:	Ross Biro
11 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 *		Florian La Roche, <flla@stud.uni-sb.de>
13 *		Alan Cox, <A.Cox@swansea.ac.uk>
14 *
15 * Fixes:
16 *		Alan Cox	: 	Numerous verify_area() problems
17 *		Alan Cox	:	Connecting on a connecting socket
18 *					now returns an error for tcp.
19 *		Alan Cox	:	sock->protocol is set correctly.
20 *					and is not sometimes left as 0.
21 *		Alan Cox	:	connect handles icmp errors on a
22 *					connect properly. Unfortunately there
23 *					is a restart syscall nasty there. I
24 *					can't match BSD without hacking the C
25 *					library. Ideas urgently sought!
26 *		Alan Cox	:	Disallow bind() to addresses that are
27 *					not ours - especially broadcast ones!!
28 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30 *					instead they leave that for the DESTROY timer.
31 *		Alan Cox	:	Clean up error flag in accept
32 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33 *					was buggy. Put a remove_sock() in the handler
34 *					for memory when we hit 0. Also altered the timer
35 *					code. The ACK stuff can wait and needs major
36 *					TCP layer surgery.
37 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38 *					and fixed timer/inet_bh race.
39 *		Alan Cox	:	Added zapped flag for TCP
40 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47 *	Pauline Middelink	:	identd support
48 *		Alan Cox	:	Fixed connect() taking signals I think.
49 *		Alan Cox	:	SO_LINGER supported
50 *		Alan Cox	:	Error reporting fixes
51 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52 *		Alan Cox	:	inet sockets don't set sk->type!
53 *		Alan Cox	:	Split socket option code
54 *		Alan Cox	:	Callbacks
55 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56 *		Alex		:	Removed restriction on inet fioctl
57 *		Alan Cox	:	Splitting INET from NET core
58 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60 *		Alan Cox	:	Split IP from generic code
61 *		Alan Cox	:	New kfree_skbmem()
62 *		Alan Cox	:	Make SO_DEBUG superuser only.
63 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64 *					(compatibility fix)
65 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66 *		Alan Cox	:	Allocator for a socket is settable.
67 *		Alan Cox	:	SO_ERROR includes soft errors.
68 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69 *		Alan Cox	: 	Generic socket allocation to make hooks
70 *					easier (suggested by Craig Metz).
71 *		Michael Pall	:	SO_ERROR returns positive errno again
72 *              Steve Whitehouse:       Added default destructor to free
73 *                                      protocol private data.
74 *              Steve Whitehouse:       Added various other default routines
75 *                                      common to several socket families.
76 *              Chris Evans     :       Call suser() check last on F_SETOWN
77 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79 *		Andi Kleen	:	Fix write_space callback
80 *		Chris Evans	:	Security fixes - signedness again
81 *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82 *
83 * To Fix:
84 *
85 *
86 *		This program is free software; you can redistribute it and/or
87 *		modify it under the terms of the GNU General Public License
88 *		as published by the Free Software Foundation; either version
89 *		2 of the License, or (at your option) any later version.
90 */
91
92#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
93
94#include <linux/capability.h>
95#include <linux/errno.h>
96#include <linux/errqueue.h>
97#include <linux/types.h>
98#include <linux/socket.h>
99#include <linux/in.h>
100#include <linux/kernel.h>
101#include <linux/module.h>
102#include <linux/proc_fs.h>
103#include <linux/seq_file.h>
104#include <linux/sched.h>
105#include <linux/timer.h>
106#include <linux/string.h>
107#include <linux/sockios.h>
108#include <linux/net.h>
109#include <linux/mm.h>
110#include <linux/slab.h>
111#include <linux/interrupt.h>
112#include <linux/poll.h>
113#include <linux/tcp.h>
114#include <linux/init.h>
115#include <linux/highmem.h>
116#include <linux/user_namespace.h>
117#include <linux/static_key.h>
118#include <linux/memcontrol.h>
119#include <linux/prefetch.h>
120
121#include <asm/uaccess.h>
122
123#include <linux/netdevice.h>
124#include <net/protocol.h>
125#include <linux/skbuff.h>
126#include <net/net_namespace.h>
127#include <net/request_sock.h>
128#include <net/sock.h>
129#include <linux/net_tstamp.h>
130#include <net/xfrm.h>
131#include <linux/ipsec.h>
132#include <net/cls_cgroup.h>
133#include <net/netprio_cgroup.h>
134
135#include <linux/filter.h>
136
137#include <trace/events/sock.h>
138
139#ifdef CONFIG_INET
140#include <net/tcp.h>
141#endif
142
143#include <net/busy_poll.h>
144
145static DEFINE_MUTEX(proto_list_mutex);
146static LIST_HEAD(proto_list);
147
148/**
149 * sk_ns_capable - General socket capability test
150 * @sk: Socket to use a capability on or through
151 * @user_ns: The user namespace of the capability to use
152 * @cap: The capability to use
153 *
154 * Test to see if the opener of the socket had when the socket was
155 * created and the current process has the capability @cap in the user
156 * namespace @user_ns.
157 */
158bool sk_ns_capable(const struct sock *sk,
159		   struct user_namespace *user_ns, int cap)
160{
161	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
162		ns_capable(user_ns, cap);
163}
164EXPORT_SYMBOL(sk_ns_capable);
165
166/**
167 * sk_capable - Socket global capability test
168 * @sk: Socket to use a capability on or through
169 * @cap: The global capability to use
170 *
171 * Test to see if the opener of the socket had when the socket was
172 * created and the current process has the capability @cap in all user
173 * namespaces.
174 */
175bool sk_capable(const struct sock *sk, int cap)
176{
177	return sk_ns_capable(sk, &init_user_ns, cap);
178}
179EXPORT_SYMBOL(sk_capable);
180
181/**
182 * sk_net_capable - Network namespace socket capability test
183 * @sk: Socket to use a capability on or through
184 * @cap: The capability to use
185 *
186 * Test to see if the opener of the socket had when the socket was created
187 * and the current process has the capability @cap over the network namespace
188 * the socket is a member of.
189 */
190bool sk_net_capable(const struct sock *sk, int cap)
191{
192	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
193}
194EXPORT_SYMBOL(sk_net_capable);
195
196
197#ifdef CONFIG_MEMCG_KMEM
198int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
199{
200	struct proto *proto;
201	int ret = 0;
202
203	mutex_lock(&proto_list_mutex);
204	list_for_each_entry(proto, &proto_list, node) {
205		if (proto->init_cgroup) {
206			ret = proto->init_cgroup(memcg, ss);
207			if (ret)
208				goto out;
209		}
210	}
211
212	mutex_unlock(&proto_list_mutex);
213	return ret;
214out:
215	list_for_each_entry_continue_reverse(proto, &proto_list, node)
216		if (proto->destroy_cgroup)
217			proto->destroy_cgroup(memcg);
218	mutex_unlock(&proto_list_mutex);
219	return ret;
220}
221
222void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg)
223{
224	struct proto *proto;
225
226	mutex_lock(&proto_list_mutex);
227	list_for_each_entry_reverse(proto, &proto_list, node)
228		if (proto->destroy_cgroup)
229			proto->destroy_cgroup(memcg);
230	mutex_unlock(&proto_list_mutex);
231}
232#endif
233
234/*
235 * Each address family might have different locking rules, so we have
236 * one slock key per address family:
237 */
238static struct lock_class_key af_family_keys[AF_MAX];
239static struct lock_class_key af_family_slock_keys[AF_MAX];
240
241#if defined(CONFIG_MEMCG_KMEM)
242struct static_key memcg_socket_limit_enabled;
243EXPORT_SYMBOL(memcg_socket_limit_enabled);
244#endif
245
246/*
247 * Make lock validator output more readable. (we pre-construct these
248 * strings build-time, so that runtime initialization of socket
249 * locks is fast):
250 */
251static const char *const af_family_key_strings[AF_MAX+1] = {
252  "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
253  "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
254  "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
255  "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
256  "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
257  "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
258  "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
259  "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
260  "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
261  "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
262  "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV"        ,
263  "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
264  "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
265  "sk_lock-AF_NFC"   , "sk_lock-AF_VSOCK"    , "sk_lock-AF_MAX"
266};
267static const char *const af_family_slock_key_strings[AF_MAX+1] = {
268  "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
269  "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
270  "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
271  "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
272  "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
273  "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
274  "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
275  "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
276  "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
277  "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
278  "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
279  "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
280  "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
281  "slock-AF_NFC"   , "slock-AF_VSOCK"    ,"slock-AF_MAX"
282};
283static const char *const af_family_clock_key_strings[AF_MAX+1] = {
284  "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
285  "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
286  "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
287  "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
288  "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
289  "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
290  "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
291  "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
292  "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
293  "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
294  "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
295  "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
296  "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
297  "clock-AF_NFC"   , "clock-AF_VSOCK"    , "clock-AF_MAX"
298};
299
300/*
301 * sk_callback_lock locking rules are per-address-family,
302 * so split the lock classes by using a per-AF key:
303 */
304static struct lock_class_key af_callback_keys[AF_MAX];
305
306/* Take into consideration the size of the struct sk_buff overhead in the
307 * determination of these values, since that is non-constant across
308 * platforms.  This makes socket queueing behavior and performance
309 * not depend upon such differences.
310 */
311#define _SK_MEM_PACKETS		256
312#define _SK_MEM_OVERHEAD	SKB_TRUESIZE(256)
313#define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
314#define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
315
316/* Run time adjustable parameters. */
317__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
318EXPORT_SYMBOL(sysctl_wmem_max);
319__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
320EXPORT_SYMBOL(sysctl_rmem_max);
321__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
322__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
323
324/* Maximal space eaten by iovec or ancillary data plus some space */
325int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
326EXPORT_SYMBOL(sysctl_optmem_max);
327
328struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
329EXPORT_SYMBOL_GPL(memalloc_socks);
330
331/**
332 * sk_set_memalloc - sets %SOCK_MEMALLOC
333 * @sk: socket to set it on
334 *
335 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
336 * It's the responsibility of the admin to adjust min_free_kbytes
337 * to meet the requirements
338 */
339void sk_set_memalloc(struct sock *sk)
340{
341	sock_set_flag(sk, SOCK_MEMALLOC);
342	sk->sk_allocation |= __GFP_MEMALLOC;
343	static_key_slow_inc(&memalloc_socks);
344}
345EXPORT_SYMBOL_GPL(sk_set_memalloc);
346
347void sk_clear_memalloc(struct sock *sk)
348{
349	sock_reset_flag(sk, SOCK_MEMALLOC);
350	sk->sk_allocation &= ~__GFP_MEMALLOC;
351	static_key_slow_dec(&memalloc_socks);
352
353	/*
354	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
355	 * progress of swapping. However, if SOCK_MEMALLOC is cleared while
356	 * it has rmem allocations there is a risk that the user of the
357	 * socket cannot make forward progress due to exceeding the rmem
358	 * limits. By rights, sk_clear_memalloc() should only be called
359	 * on sockets being torn down but warn and reset the accounting if
360	 * that assumption breaks.
361	 */
362	if (WARN_ON(sk->sk_forward_alloc))
363		sk_mem_reclaim(sk);
364}
365EXPORT_SYMBOL_GPL(sk_clear_memalloc);
366
367int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
368{
369	int ret;
370	unsigned long pflags = current->flags;
371
372	/* these should have been dropped before queueing */
373	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
374
375	current->flags |= PF_MEMALLOC;
376	ret = sk->sk_backlog_rcv(sk, skb);
377	tsk_restore_flags(current, pflags, PF_MEMALLOC);
378
379	return ret;
380}
381EXPORT_SYMBOL(__sk_backlog_rcv);
382
383static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
384{
385	struct timeval tv;
386
387	if (optlen < sizeof(tv))
388		return -EINVAL;
389	if (copy_from_user(&tv, optval, sizeof(tv)))
390		return -EFAULT;
391	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
392		return -EDOM;
393
394	if (tv.tv_sec < 0) {
395		static int warned __read_mostly;
396
397		*timeo_p = 0;
398		if (warned < 10 && net_ratelimit()) {
399			warned++;
400			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
401				__func__, current->comm, task_pid_nr(current));
402		}
403		return 0;
404	}
405	*timeo_p = MAX_SCHEDULE_TIMEOUT;
406	if (tv.tv_sec == 0 && tv.tv_usec == 0)
407		return 0;
408	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
409		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
410	return 0;
411}
412
413static void sock_warn_obsolete_bsdism(const char *name)
414{
415	static int warned;
416	static char warncomm[TASK_COMM_LEN];
417	if (strcmp(warncomm, current->comm) && warned < 5) {
418		strcpy(warncomm,  current->comm);
419		pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
420			warncomm, name);
421		warned++;
422	}
423}
424
425#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
426
427static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
428{
429	if (sk->sk_flags & flags) {
430		sk->sk_flags &= ~flags;
431		if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP))
432			net_disable_timestamp();
433	}
434}
435
436
437int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
438{
439	int err;
440	unsigned long flags;
441	struct sk_buff_head *list = &sk->sk_receive_queue;
442
443	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
444		atomic_inc(&sk->sk_drops);
445		trace_sock_rcvqueue_full(sk, skb);
446		return -ENOMEM;
447	}
448
449	err = sk_filter(sk, skb);
450	if (err)
451		return err;
452
453	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
454		atomic_inc(&sk->sk_drops);
455		return -ENOBUFS;
456	}
457
458	skb->dev = NULL;
459	skb_set_owner_r(skb, sk);
460
461	/* we escape from rcu protected region, make sure we dont leak
462	 * a norefcounted dst
463	 */
464	skb_dst_force(skb);
465
466	spin_lock_irqsave(&list->lock, flags);
467	skb->dropcount = atomic_read(&sk->sk_drops);
468	__skb_queue_tail(list, skb);
469	spin_unlock_irqrestore(&list->lock, flags);
470
471	if (!sock_flag(sk, SOCK_DEAD))
472		sk->sk_data_ready(sk);
473	return 0;
474}
475EXPORT_SYMBOL(sock_queue_rcv_skb);
476
477int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
478{
479	int rc = NET_RX_SUCCESS;
480
481	if (sk_filter(sk, skb))
482		goto discard_and_relse;
483
484	skb->dev = NULL;
485
486	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
487		atomic_inc(&sk->sk_drops);
488		goto discard_and_relse;
489	}
490	if (nested)
491		bh_lock_sock_nested(sk);
492	else
493		bh_lock_sock(sk);
494	if (!sock_owned_by_user(sk)) {
495		/*
496		 * trylock + unlock semantics:
497		 */
498		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
499
500		rc = sk_backlog_rcv(sk, skb);
501
502		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
503	} else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
504		bh_unlock_sock(sk);
505		atomic_inc(&sk->sk_drops);
506		goto discard_and_relse;
507	}
508
509	bh_unlock_sock(sk);
510out:
511	sock_put(sk);
512	return rc;
513discard_and_relse:
514	kfree_skb(skb);
515	goto out;
516}
517EXPORT_SYMBOL(sk_receive_skb);
518
519struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
520{
521	struct dst_entry *dst = __sk_dst_get(sk);
522
523	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
524		sk_tx_queue_clear(sk);
525		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
526		dst_release(dst);
527		return NULL;
528	}
529
530	return dst;
531}
532EXPORT_SYMBOL(__sk_dst_check);
533
534struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
535{
536	struct dst_entry *dst = sk_dst_get(sk);
537
538	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
539		sk_dst_reset(sk);
540		dst_release(dst);
541		return NULL;
542	}
543
544	return dst;
545}
546EXPORT_SYMBOL(sk_dst_check);
547
548static int sock_setbindtodevice(struct sock *sk, char __user *optval,
549				int optlen)
550{
551	int ret = -ENOPROTOOPT;
552#ifdef CONFIG_NETDEVICES
553	struct net *net = sock_net(sk);
554	char devname[IFNAMSIZ];
555	int index;
556
557	/* Sorry... */
558	ret = -EPERM;
559	if (!ns_capable(net->user_ns, CAP_NET_RAW))
560		goto out;
561
562	ret = -EINVAL;
563	if (optlen < 0)
564		goto out;
565
566	/* Bind this socket to a particular device like "eth0",
567	 * as specified in the passed interface name. If the
568	 * name is "" or the option length is zero the socket
569	 * is not bound.
570	 */
571	if (optlen > IFNAMSIZ - 1)
572		optlen = IFNAMSIZ - 1;
573	memset(devname, 0, sizeof(devname));
574
575	ret = -EFAULT;
576	if (copy_from_user(devname, optval, optlen))
577		goto out;
578
579	index = 0;
580	if (devname[0] != '\0') {
581		struct net_device *dev;
582
583		rcu_read_lock();
584		dev = dev_get_by_name_rcu(net, devname);
585		if (dev)
586			index = dev->ifindex;
587		rcu_read_unlock();
588		ret = -ENODEV;
589		if (!dev)
590			goto out;
591	}
592
593	lock_sock(sk);
594	sk->sk_bound_dev_if = index;
595	sk_dst_reset(sk);
596	release_sock(sk);
597
598	ret = 0;
599
600out:
601#endif
602
603	return ret;
604}
605
606static int sock_getbindtodevice(struct sock *sk, char __user *optval,
607				int __user *optlen, int len)
608{
609	int ret = -ENOPROTOOPT;
610#ifdef CONFIG_NETDEVICES
611	struct net *net = sock_net(sk);
612	char devname[IFNAMSIZ];
613
614	if (sk->sk_bound_dev_if == 0) {
615		len = 0;
616		goto zero;
617	}
618
619	ret = -EINVAL;
620	if (len < IFNAMSIZ)
621		goto out;
622
623	ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
624	if (ret)
625		goto out;
626
627	len = strlen(devname) + 1;
628
629	ret = -EFAULT;
630	if (copy_to_user(optval, devname, len))
631		goto out;
632
633zero:
634	ret = -EFAULT;
635	if (put_user(len, optlen))
636		goto out;
637
638	ret = 0;
639
640out:
641#endif
642
643	return ret;
644}
645
646static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
647{
648	if (valbool)
649		sock_set_flag(sk, bit);
650	else
651		sock_reset_flag(sk, bit);
652}
653
654/*
655 *	This is meant for all protocols to use and covers goings on
656 *	at the socket level. Everything here is generic.
657 */
658
659int sock_setsockopt(struct socket *sock, int level, int optname,
660		    char __user *optval, unsigned int optlen)
661{
662	struct sock *sk = sock->sk;
663	int val;
664	int valbool;
665	struct linger ling;
666	int ret = 0;
667
668	/*
669	 *	Options without arguments
670	 */
671
672	if (optname == SO_BINDTODEVICE)
673		return sock_setbindtodevice(sk, optval, optlen);
674
675	if (optlen < sizeof(int))
676		return -EINVAL;
677
678	if (get_user(val, (int __user *)optval))
679		return -EFAULT;
680
681	valbool = val ? 1 : 0;
682
683	lock_sock(sk);
684
685	switch (optname) {
686	case SO_DEBUG:
687		if (val && !capable(CAP_NET_ADMIN))
688			ret = -EACCES;
689		else
690			sock_valbool_flag(sk, SOCK_DBG, valbool);
691		break;
692	case SO_REUSEADDR:
693		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
694		break;
695	case SO_REUSEPORT:
696		sk->sk_reuseport = valbool;
697		break;
698	case SO_TYPE:
699	case SO_PROTOCOL:
700	case SO_DOMAIN:
701	case SO_ERROR:
702		ret = -ENOPROTOOPT;
703		break;
704	case SO_DONTROUTE:
705		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
706		break;
707	case SO_BROADCAST:
708		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
709		break;
710	case SO_SNDBUF:
711		/* Don't error on this BSD doesn't and if you think
712		 * about it this is right. Otherwise apps have to
713		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
714		 * are treated in BSD as hints
715		 */
716		val = min_t(u32, val, sysctl_wmem_max);
717set_sndbuf:
718		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
719		sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF);
720		/* Wake up sending tasks if we upped the value. */
721		sk->sk_write_space(sk);
722		break;
723
724	case SO_SNDBUFFORCE:
725		if (!capable(CAP_NET_ADMIN)) {
726			ret = -EPERM;
727			break;
728		}
729		goto set_sndbuf;
730
731	case SO_RCVBUF:
732		/* Don't error on this BSD doesn't and if you think
733		 * about it this is right. Otherwise apps have to
734		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
735		 * are treated in BSD as hints
736		 */
737		val = min_t(u32, val, sysctl_rmem_max);
738set_rcvbuf:
739		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
740		/*
741		 * We double it on the way in to account for
742		 * "struct sk_buff" etc. overhead.   Applications
743		 * assume that the SO_RCVBUF setting they make will
744		 * allow that much actual data to be received on that
745		 * socket.
746		 *
747		 * Applications are unaware that "struct sk_buff" and
748		 * other overheads allocate from the receive buffer
749		 * during socket buffer allocation.
750		 *
751		 * And after considering the possible alternatives,
752		 * returning the value we actually used in getsockopt
753		 * is the most desirable behavior.
754		 */
755		sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF);
756		break;
757
758	case SO_RCVBUFFORCE:
759		if (!capable(CAP_NET_ADMIN)) {
760			ret = -EPERM;
761			break;
762		}
763		goto set_rcvbuf;
764
765	case SO_KEEPALIVE:
766#ifdef CONFIG_INET
767		if (sk->sk_protocol == IPPROTO_TCP &&
768		    sk->sk_type == SOCK_STREAM)
769			tcp_set_keepalive(sk, valbool);
770#endif
771		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
772		break;
773
774	case SO_OOBINLINE:
775		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
776		break;
777
778	case SO_NO_CHECK:
779		sk->sk_no_check_tx = valbool;
780		break;
781
782	case SO_PRIORITY:
783		if ((val >= 0 && val <= 6) ||
784		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
785			sk->sk_priority = val;
786		else
787			ret = -EPERM;
788		break;
789
790	case SO_LINGER:
791		if (optlen < sizeof(ling)) {
792			ret = -EINVAL;	/* 1003.1g */
793			break;
794		}
795		if (copy_from_user(&ling, optval, sizeof(ling))) {
796			ret = -EFAULT;
797			break;
798		}
799		if (!ling.l_onoff)
800			sock_reset_flag(sk, SOCK_LINGER);
801		else {
802#if (BITS_PER_LONG == 32)
803			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
804				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
805			else
806#endif
807				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
808			sock_set_flag(sk, SOCK_LINGER);
809		}
810		break;
811
812	case SO_BSDCOMPAT:
813		sock_warn_obsolete_bsdism("setsockopt");
814		break;
815
816	case SO_PASSCRED:
817		if (valbool)
818			set_bit(SOCK_PASSCRED, &sock->flags);
819		else
820			clear_bit(SOCK_PASSCRED, &sock->flags);
821		break;
822
823	case SO_TIMESTAMP:
824	case SO_TIMESTAMPNS:
825		if (valbool)  {
826			if (optname == SO_TIMESTAMP)
827				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
828			else
829				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
830			sock_set_flag(sk, SOCK_RCVTSTAMP);
831			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
832		} else {
833			sock_reset_flag(sk, SOCK_RCVTSTAMP);
834			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
835		}
836		break;
837
838	case SO_TIMESTAMPING:
839		if (val & ~SOF_TIMESTAMPING_MASK) {
840			ret = -EINVAL;
841			break;
842		}
843		if (val & SOF_TIMESTAMPING_OPT_ID &&
844		    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
845			if (sk->sk_protocol == IPPROTO_TCP) {
846				if (sk->sk_state != TCP_ESTABLISHED) {
847					ret = -EINVAL;
848					break;
849				}
850				sk->sk_tskey = tcp_sk(sk)->snd_una;
851			} else {
852				sk->sk_tskey = 0;
853			}
854		}
855		sk->sk_tsflags = val;
856		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
857			sock_enable_timestamp(sk,
858					      SOCK_TIMESTAMPING_RX_SOFTWARE);
859		else
860			sock_disable_timestamp(sk,
861					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
862		break;
863
864	case SO_RCVLOWAT:
865		if (val < 0)
866			val = INT_MAX;
867		sk->sk_rcvlowat = val ? : 1;
868		break;
869
870	case SO_RCVTIMEO:
871		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
872		break;
873
874	case SO_SNDTIMEO:
875		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
876		break;
877
878	case SO_ATTACH_FILTER:
879		ret = -EINVAL;
880		if (optlen == sizeof(struct sock_fprog)) {
881			struct sock_fprog fprog;
882
883			ret = -EFAULT;
884			if (copy_from_user(&fprog, optval, sizeof(fprog)))
885				break;
886
887			ret = sk_attach_filter(&fprog, sk);
888		}
889		break;
890
891	case SO_DETACH_FILTER:
892		ret = sk_detach_filter(sk);
893		break;
894
895	case SO_LOCK_FILTER:
896		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
897			ret = -EPERM;
898		else
899			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
900		break;
901
902	case SO_PASSSEC:
903		if (valbool)
904			set_bit(SOCK_PASSSEC, &sock->flags);
905		else
906			clear_bit(SOCK_PASSSEC, &sock->flags);
907		break;
908	case SO_MARK:
909		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
910			ret = -EPERM;
911		else
912			sk->sk_mark = val;
913		break;
914
915		/* We implement the SO_SNDLOWAT etc to
916		   not be settable (1003.1g 5.3) */
917	case SO_RXQ_OVFL:
918		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
919		break;
920
921	case SO_WIFI_STATUS:
922		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
923		break;
924
925	case SO_PEEK_OFF:
926		if (sock->ops->set_peek_off)
927			ret = sock->ops->set_peek_off(sk, val);
928		else
929			ret = -EOPNOTSUPP;
930		break;
931
932	case SO_NOFCS:
933		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
934		break;
935
936	case SO_SELECT_ERR_QUEUE:
937		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
938		break;
939
940#ifdef CONFIG_NET_RX_BUSY_POLL
941	case SO_BUSY_POLL:
942		/* allow unprivileged users to decrease the value */
943		if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
944			ret = -EPERM;
945		else {
946			if (val < 0)
947				ret = -EINVAL;
948			else
949				sk->sk_ll_usec = val;
950		}
951		break;
952#endif
953
954	case SO_MAX_PACING_RATE:
955		sk->sk_max_pacing_rate = val;
956		sk->sk_pacing_rate = min(sk->sk_pacing_rate,
957					 sk->sk_max_pacing_rate);
958		break;
959
960	default:
961		ret = -ENOPROTOOPT;
962		break;
963	}
964	release_sock(sk);
965	return ret;
966}
967EXPORT_SYMBOL(sock_setsockopt);
968
969
970static void cred_to_ucred(struct pid *pid, const struct cred *cred,
971			  struct ucred *ucred)
972{
973	ucred->pid = pid_vnr(pid);
974	ucred->uid = ucred->gid = -1;
975	if (cred) {
976		struct user_namespace *current_ns = current_user_ns();
977
978		ucred->uid = from_kuid_munged(current_ns, cred->euid);
979		ucred->gid = from_kgid_munged(current_ns, cred->egid);
980	}
981}
982
983int sock_getsockopt(struct socket *sock, int level, int optname,
984		    char __user *optval, int __user *optlen)
985{
986	struct sock *sk = sock->sk;
987
988	union {
989		int val;
990		struct linger ling;
991		struct timeval tm;
992	} v;
993
994	int lv = sizeof(int);
995	int len;
996
997	if (get_user(len, optlen))
998		return -EFAULT;
999	if (len < 0)
1000		return -EINVAL;
1001
1002	memset(&v, 0, sizeof(v));
1003
1004	switch (optname) {
1005	case SO_DEBUG:
1006		v.val = sock_flag(sk, SOCK_DBG);
1007		break;
1008
1009	case SO_DONTROUTE:
1010		v.val = sock_flag(sk, SOCK_LOCALROUTE);
1011		break;
1012
1013	case SO_BROADCAST:
1014		v.val = sock_flag(sk, SOCK_BROADCAST);
1015		break;
1016
1017	case SO_SNDBUF:
1018		v.val = sk->sk_sndbuf;
1019		break;
1020
1021	case SO_RCVBUF:
1022		v.val = sk->sk_rcvbuf;
1023		break;
1024
1025	case SO_REUSEADDR:
1026		v.val = sk->sk_reuse;
1027		break;
1028
1029	case SO_REUSEPORT:
1030		v.val = sk->sk_reuseport;
1031		break;
1032
1033	case SO_KEEPALIVE:
1034		v.val = sock_flag(sk, SOCK_KEEPOPEN);
1035		break;
1036
1037	case SO_TYPE:
1038		v.val = sk->sk_type;
1039		break;
1040
1041	case SO_PROTOCOL:
1042		v.val = sk->sk_protocol;
1043		break;
1044
1045	case SO_DOMAIN:
1046		v.val = sk->sk_family;
1047		break;
1048
1049	case SO_ERROR:
1050		v.val = -sock_error(sk);
1051		if (v.val == 0)
1052			v.val = xchg(&sk->sk_err_soft, 0);
1053		break;
1054
1055	case SO_OOBINLINE:
1056		v.val = sock_flag(sk, SOCK_URGINLINE);
1057		break;
1058
1059	case SO_NO_CHECK:
1060		v.val = sk->sk_no_check_tx;
1061		break;
1062
1063	case SO_PRIORITY:
1064		v.val = sk->sk_priority;
1065		break;
1066
1067	case SO_LINGER:
1068		lv		= sizeof(v.ling);
1069		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
1070		v.ling.l_linger	= sk->sk_lingertime / HZ;
1071		break;
1072
1073	case SO_BSDCOMPAT:
1074		sock_warn_obsolete_bsdism("getsockopt");
1075		break;
1076
1077	case SO_TIMESTAMP:
1078		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1079				!sock_flag(sk, SOCK_RCVTSTAMPNS);
1080		break;
1081
1082	case SO_TIMESTAMPNS:
1083		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1084		break;
1085
1086	case SO_TIMESTAMPING:
1087		v.val = sk->sk_tsflags;
1088		break;
1089
1090	case SO_RCVTIMEO:
1091		lv = sizeof(struct timeval);
1092		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1093			v.tm.tv_sec = 0;
1094			v.tm.tv_usec = 0;
1095		} else {
1096			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1097			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
1098		}
1099		break;
1100
1101	case SO_SNDTIMEO:
1102		lv = sizeof(struct timeval);
1103		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1104			v.tm.tv_sec = 0;
1105			v.tm.tv_usec = 0;
1106		} else {
1107			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1108			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
1109		}
1110		break;
1111
1112	case SO_RCVLOWAT:
1113		v.val = sk->sk_rcvlowat;
1114		break;
1115
1116	case SO_SNDLOWAT:
1117		v.val = 1;
1118		break;
1119
1120	case SO_PASSCRED:
1121		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1122		break;
1123
1124	case SO_PEERCRED:
1125	{
1126		struct ucred peercred;
1127		if (len > sizeof(peercred))
1128			len = sizeof(peercred);
1129		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1130		if (copy_to_user(optval, &peercred, len))
1131			return -EFAULT;
1132		goto lenout;
1133	}
1134
1135	case SO_PEERNAME:
1136	{
1137		char address[128];
1138
1139		if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
1140			return -ENOTCONN;
1141		if (lv < len)
1142			return -EINVAL;
1143		if (copy_to_user(optval, address, len))
1144			return -EFAULT;
1145		goto lenout;
1146	}
1147
1148	/* Dubious BSD thing... Probably nobody even uses it, but
1149	 * the UNIX standard wants it for whatever reason... -DaveM
1150	 */
1151	case SO_ACCEPTCONN:
1152		v.val = sk->sk_state == TCP_LISTEN;
1153		break;
1154
1155	case SO_PASSSEC:
1156		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1157		break;
1158
1159	case SO_PEERSEC:
1160		return security_socket_getpeersec_stream(sock, optval, optlen, len);
1161
1162	case SO_MARK:
1163		v.val = sk->sk_mark;
1164		break;
1165
1166	case SO_RXQ_OVFL:
1167		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1168		break;
1169
1170	case SO_WIFI_STATUS:
1171		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1172		break;
1173
1174	case SO_PEEK_OFF:
1175		if (!sock->ops->set_peek_off)
1176			return -EOPNOTSUPP;
1177
1178		v.val = sk->sk_peek_off;
1179		break;
1180	case SO_NOFCS:
1181		v.val = sock_flag(sk, SOCK_NOFCS);
1182		break;
1183
1184	case SO_BINDTODEVICE:
1185		return sock_getbindtodevice(sk, optval, optlen, len);
1186
1187	case SO_GET_FILTER:
1188		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1189		if (len < 0)
1190			return len;
1191
1192		goto lenout;
1193
1194	case SO_LOCK_FILTER:
1195		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1196		break;
1197
1198	case SO_BPF_EXTENSIONS:
1199		v.val = bpf_tell_extensions();
1200		break;
1201
1202	case SO_SELECT_ERR_QUEUE:
1203		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1204		break;
1205
1206#ifdef CONFIG_NET_RX_BUSY_POLL
1207	case SO_BUSY_POLL:
1208		v.val = sk->sk_ll_usec;
1209		break;
1210#endif
1211
1212	case SO_MAX_PACING_RATE:
1213		v.val = sk->sk_max_pacing_rate;
1214		break;
1215
1216	default:
1217		return -ENOPROTOOPT;
1218	}
1219
1220	if (len > lv)
1221		len = lv;
1222	if (copy_to_user(optval, &v, len))
1223		return -EFAULT;
1224lenout:
1225	if (put_user(len, optlen))
1226		return -EFAULT;
1227	return 0;
1228}
1229
1230/*
1231 * Initialize an sk_lock.
1232 *
1233 * (We also register the sk_lock with the lock validator.)
1234 */
1235static inline void sock_lock_init(struct sock *sk)
1236{
1237	sock_lock_init_class_and_name(sk,
1238			af_family_slock_key_strings[sk->sk_family],
1239			af_family_slock_keys + sk->sk_family,
1240			af_family_key_strings[sk->sk_family],
1241			af_family_keys + sk->sk_family);
1242}
1243
1244/*
1245 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1246 * even temporarly, because of RCU lookups. sk_node should also be left as is.
1247 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1248 */
1249static void sock_copy(struct sock *nsk, const struct sock *osk)
1250{
1251#ifdef CONFIG_SECURITY_NETWORK
1252	void *sptr = nsk->sk_security;
1253#endif
1254	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1255
1256	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1257	       osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1258
1259#ifdef CONFIG_SECURITY_NETWORK
1260	nsk->sk_security = sptr;
1261	security_sk_clone(osk, nsk);
1262#endif
1263}
1264
1265void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
1266{
1267	unsigned long nulls1, nulls2;
1268
1269	nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
1270	nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
1271	if (nulls1 > nulls2)
1272		swap(nulls1, nulls2);
1273
1274	if (nulls1 != 0)
1275		memset((char *)sk, 0, nulls1);
1276	memset((char *)sk + nulls1 + sizeof(void *), 0,
1277	       nulls2 - nulls1 - sizeof(void *));
1278	memset((char *)sk + nulls2 + sizeof(void *), 0,
1279	       size - nulls2 - sizeof(void *));
1280}
1281EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
1282
1283static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1284		int family)
1285{
1286	struct sock *sk;
1287	struct kmem_cache *slab;
1288
1289	slab = prot->slab;
1290	if (slab != NULL) {
1291		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1292		if (!sk)
1293			return sk;
1294		if (priority & __GFP_ZERO) {
1295			if (prot->clear_sk)
1296				prot->clear_sk(sk, prot->obj_size);
1297			else
1298				sk_prot_clear_nulls(sk, prot->obj_size);
1299		}
1300	} else
1301		sk = kmalloc(prot->obj_size, priority);
1302
1303	if (sk != NULL) {
1304		kmemcheck_annotate_bitfield(sk, flags);
1305
1306		if (security_sk_alloc(sk, family, priority))
1307			goto out_free;
1308
1309		if (!try_module_get(prot->owner))
1310			goto out_free_sec;
1311		sk_tx_queue_clear(sk);
1312	}
1313
1314	return sk;
1315
1316out_free_sec:
1317	security_sk_free(sk);
1318out_free:
1319	if (slab != NULL)
1320		kmem_cache_free(slab, sk);
1321	else
1322		kfree(sk);
1323	return NULL;
1324}
1325
1326static void sk_prot_free(struct proto *prot, struct sock *sk)
1327{
1328	struct kmem_cache *slab;
1329	struct module *owner;
1330
1331	owner = prot->owner;
1332	slab = prot->slab;
1333
1334	security_sk_free(sk);
1335	if (slab != NULL)
1336		kmem_cache_free(slab, sk);
1337	else
1338		kfree(sk);
1339	module_put(owner);
1340}
1341
1342#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
1343void sock_update_netprioidx(struct sock *sk)
1344{
1345	if (in_interrupt())
1346		return;
1347
1348	sk->sk_cgrp_prioidx = task_netprioidx(current);
1349}
1350EXPORT_SYMBOL_GPL(sock_update_netprioidx);
1351#endif
1352
1353/**
1354 *	sk_alloc - All socket objects are allocated here
1355 *	@net: the applicable net namespace
1356 *	@family: protocol family
1357 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1358 *	@prot: struct proto associated with this new sock instance
1359 */
1360struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1361		      struct proto *prot)
1362{
1363	struct sock *sk;
1364
1365	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1366	if (sk) {
1367		sk->sk_family = family;
1368		/*
1369		 * See comment in struct sock definition to understand
1370		 * why we need sk_prot_creator -acme
1371		 */
1372		sk->sk_prot = sk->sk_prot_creator = prot;
1373		sock_lock_init(sk);
1374		sock_net_set(sk, get_net(net));
1375		atomic_set(&sk->sk_wmem_alloc, 1);
1376
1377		sock_update_classid(sk);
1378		sock_update_netprioidx(sk);
1379	}
1380
1381	return sk;
1382}
1383EXPORT_SYMBOL(sk_alloc);
1384
1385static void __sk_free(struct sock *sk)
1386{
1387	struct sk_filter *filter;
1388
1389	if (sk->sk_destruct)
1390		sk->sk_destruct(sk);
1391
1392	filter = rcu_dereference_check(sk->sk_filter,
1393				       atomic_read(&sk->sk_wmem_alloc) == 0);
1394	if (filter) {
1395		sk_filter_uncharge(sk, filter);
1396		RCU_INIT_POINTER(sk->sk_filter, NULL);
1397	}
1398
1399	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1400
1401	if (atomic_read(&sk->sk_omem_alloc))
1402		pr_debug("%s: optmem leakage (%d bytes) detected\n",
1403			 __func__, atomic_read(&sk->sk_omem_alloc));
1404
1405	if (sk->sk_peer_cred)
1406		put_cred(sk->sk_peer_cred);
1407	put_pid(sk->sk_peer_pid);
1408	put_net(sock_net(sk));
1409	sk_prot_free(sk->sk_prot_creator, sk);
1410}
1411
1412void sk_free(struct sock *sk)
1413{
1414	/*
1415	 * We subtract one from sk_wmem_alloc and can know if
1416	 * some packets are still in some tx queue.
1417	 * If not null, sock_wfree() will call __sk_free(sk) later
1418	 */
1419	if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1420		__sk_free(sk);
1421}
1422EXPORT_SYMBOL(sk_free);
1423
1424/*
1425 * Last sock_put should drop reference to sk->sk_net. It has already
1426 * been dropped in sk_change_net. Taking reference to stopping namespace
1427 * is not an option.
1428 * Take reference to a socket to remove it from hash _alive_ and after that
1429 * destroy it in the context of init_net.
1430 */
1431void sk_release_kernel(struct sock *sk)
1432{
1433	if (sk == NULL || sk->sk_socket == NULL)
1434		return;
1435
1436	sock_hold(sk);
1437	sock_release(sk->sk_socket);
1438	release_net(sock_net(sk));
1439	sock_net_set(sk, get_net(&init_net));
1440	sock_put(sk);
1441}
1442EXPORT_SYMBOL(sk_release_kernel);
1443
1444static void sk_update_clone(const struct sock *sk, struct sock *newsk)
1445{
1446	if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
1447		sock_update_memcg(newsk);
1448}
1449
1450/**
1451 *	sk_clone_lock - clone a socket, and lock its clone
1452 *	@sk: the socket to clone
1453 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1454 *
1455 *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1456 */
1457struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1458{
1459	struct sock *newsk;
1460	bool is_charged = true;
1461
1462	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1463	if (newsk != NULL) {
1464		struct sk_filter *filter;
1465
1466		sock_copy(newsk, sk);
1467
1468		/* SANITY */
1469		get_net(sock_net(newsk));
1470		sk_node_init(&newsk->sk_node);
1471		sock_lock_init(newsk);
1472		bh_lock_sock(newsk);
1473		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
1474		newsk->sk_backlog.len = 0;
1475
1476		atomic_set(&newsk->sk_rmem_alloc, 0);
1477		/*
1478		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1479		 */
1480		atomic_set(&newsk->sk_wmem_alloc, 1);
1481		atomic_set(&newsk->sk_omem_alloc, 0);
1482		skb_queue_head_init(&newsk->sk_receive_queue);
1483		skb_queue_head_init(&newsk->sk_write_queue);
1484
1485		spin_lock_init(&newsk->sk_dst_lock);
1486		rwlock_init(&newsk->sk_callback_lock);
1487		lockdep_set_class_and_name(&newsk->sk_callback_lock,
1488				af_callback_keys + newsk->sk_family,
1489				af_family_clock_key_strings[newsk->sk_family]);
1490
1491		newsk->sk_dst_cache	= NULL;
1492		newsk->sk_wmem_queued	= 0;
1493		newsk->sk_forward_alloc = 0;
1494		newsk->sk_send_head	= NULL;
1495		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1496
1497		sock_reset_flag(newsk, SOCK_DONE);
1498		skb_queue_head_init(&newsk->sk_error_queue);
1499
1500		filter = rcu_dereference_protected(newsk->sk_filter, 1);
1501		if (filter != NULL)
1502			/* though it's an empty new sock, the charging may fail
1503			 * if sysctl_optmem_max was changed between creation of
1504			 * original socket and cloning
1505			 */
1506			is_charged = sk_filter_charge(newsk, filter);
1507
1508		if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk))) {
1509			/* It is still raw copy of parent, so invalidate
1510			 * destructor and make plain sk_free() */
1511			newsk->sk_destruct = NULL;
1512			bh_unlock_sock(newsk);
1513			sk_free(newsk);
1514			newsk = NULL;
1515			goto out;
1516		}
1517
1518		newsk->sk_err	   = 0;
1519		newsk->sk_priority = 0;
1520		/*
1521		 * Before updating sk_refcnt, we must commit prior changes to memory
1522		 * (Documentation/RCU/rculist_nulls.txt for details)
1523		 */
1524		smp_wmb();
1525		atomic_set(&newsk->sk_refcnt, 2);
1526
1527		/*
1528		 * Increment the counter in the same struct proto as the master
1529		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1530		 * is the same as sk->sk_prot->socks, as this field was copied
1531		 * with memcpy).
1532		 *
1533		 * This _changes_ the previous behaviour, where
1534		 * tcp_create_openreq_child always was incrementing the
1535		 * equivalent to tcp_prot->socks (inet_sock_nr), so this have
1536		 * to be taken into account in all callers. -acme
1537		 */
1538		sk_refcnt_debug_inc(newsk);
1539		sk_set_socket(newsk, NULL);
1540		newsk->sk_wq = NULL;
1541
1542		sk_update_clone(sk, newsk);
1543
1544		if (newsk->sk_prot->sockets_allocated)
1545			sk_sockets_allocated_inc(newsk);
1546
1547		if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1548			net_enable_timestamp();
1549	}
1550out:
1551	return newsk;
1552}
1553EXPORT_SYMBOL_GPL(sk_clone_lock);
1554
1555void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1556{
1557	__sk_dst_set(sk, dst);
1558	sk->sk_route_caps = dst->dev->features;
1559	if (sk->sk_route_caps & NETIF_F_GSO)
1560		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1561	sk->sk_route_caps &= ~sk->sk_route_nocaps;
1562	if (sk_can_gso(sk)) {
1563		if (dst->header_len) {
1564			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1565		} else {
1566			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1567			sk->sk_gso_max_size = dst->dev->gso_max_size;
1568			sk->sk_gso_max_segs = dst->dev->gso_max_segs;
1569		}
1570	}
1571}
1572EXPORT_SYMBOL_GPL(sk_setup_caps);
1573
1574/*
1575 *	Simple resource managers for sockets.
1576 */
1577
1578
1579/*
1580 * Write buffer destructor automatically called from kfree_skb.
1581 */
1582void sock_wfree(struct sk_buff *skb)
1583{
1584	struct sock *sk = skb->sk;
1585	unsigned int len = skb->truesize;
1586
1587	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1588		/*
1589		 * Keep a reference on sk_wmem_alloc, this will be released
1590		 * after sk_write_space() call
1591		 */
1592		atomic_sub(len - 1, &sk->sk_wmem_alloc);
1593		sk->sk_write_space(sk);
1594		len = 1;
1595	}
1596	/*
1597	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1598	 * could not do because of in-flight packets
1599	 */
1600	if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1601		__sk_free(sk);
1602}
1603EXPORT_SYMBOL(sock_wfree);
1604
1605void skb_orphan_partial(struct sk_buff *skb)
1606{
1607	/* TCP stack sets skb->ooo_okay based on sk_wmem_alloc,
1608	 * so we do not completely orphan skb, but transfert all
1609	 * accounted bytes but one, to avoid unexpected reorders.
1610	 */
1611	if (skb->destructor == sock_wfree
1612#ifdef CONFIG_INET
1613	    || skb->destructor == tcp_wfree
1614#endif
1615		) {
1616		atomic_sub(skb->truesize - 1, &skb->sk->sk_wmem_alloc);
1617		skb->truesize = 1;
1618	} else {
1619		skb_orphan(skb);
1620	}
1621}
1622EXPORT_SYMBOL(skb_orphan_partial);
1623
1624/*
1625 * Read buffer destructor automatically called from kfree_skb.
1626 */
1627void sock_rfree(struct sk_buff *skb)
1628{
1629	struct sock *sk = skb->sk;
1630	unsigned int len = skb->truesize;
1631
1632	atomic_sub(len, &sk->sk_rmem_alloc);
1633	sk_mem_uncharge(sk, len);
1634}
1635EXPORT_SYMBOL(sock_rfree);
1636
1637void sock_efree(struct sk_buff *skb)
1638{
1639	sock_put(skb->sk);
1640}
1641EXPORT_SYMBOL(sock_efree);
1642
1643#ifdef CONFIG_INET
1644void sock_edemux(struct sk_buff *skb)
1645{
1646	struct sock *sk = skb->sk;
1647
1648	if (sk->sk_state == TCP_TIME_WAIT)
1649		inet_twsk_put(inet_twsk(sk));
1650	else
1651		sock_put(sk);
1652}
1653EXPORT_SYMBOL(sock_edemux);
1654#endif
1655
1656kuid_t sock_i_uid(struct sock *sk)
1657{
1658	kuid_t uid;
1659
1660	read_lock_bh(&sk->sk_callback_lock);
1661	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1662	read_unlock_bh(&sk->sk_callback_lock);
1663	return uid;
1664}
1665EXPORT_SYMBOL(sock_i_uid);
1666
1667unsigned long sock_i_ino(struct sock *sk)
1668{
1669	unsigned long ino;
1670
1671	read_lock_bh(&sk->sk_callback_lock);
1672	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1673	read_unlock_bh(&sk->sk_callback_lock);
1674	return ino;
1675}
1676EXPORT_SYMBOL(sock_i_ino);
1677
1678/*
1679 * Allocate a skb from the socket's send buffer.
1680 */
1681struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1682			     gfp_t priority)
1683{
1684	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1685		struct sk_buff *skb = alloc_skb(size, priority);
1686		if (skb) {
1687			skb_set_owner_w(skb, sk);
1688			return skb;
1689		}
1690	}
1691	return NULL;
1692}
1693EXPORT_SYMBOL(sock_wmalloc);
1694
1695/*
1696 * Allocate a memory block from the socket's option memory buffer.
1697 */
1698void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1699{
1700	if ((unsigned int)size <= sysctl_optmem_max &&
1701	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1702		void *mem;
1703		/* First do the add, to avoid the race if kmalloc
1704		 * might sleep.
1705		 */
1706		atomic_add(size, &sk->sk_omem_alloc);
1707		mem = kmalloc(size, priority);
1708		if (mem)
1709			return mem;
1710		atomic_sub(size, &sk->sk_omem_alloc);
1711	}
1712	return NULL;
1713}
1714EXPORT_SYMBOL(sock_kmalloc);
1715
1716/*
1717 * Free an option memory block.
1718 */
1719void sock_kfree_s(struct sock *sk, void *mem, int size)
1720{
1721	if (WARN_ON_ONCE(!mem))
1722		return;
1723	kfree(mem);
1724	atomic_sub(size, &sk->sk_omem_alloc);
1725}
1726EXPORT_SYMBOL(sock_kfree_s);
1727
1728/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1729   I think, these locks should be removed for datagram sockets.
1730 */
1731static long sock_wait_for_wmem(struct sock *sk, long timeo)
1732{
1733	DEFINE_WAIT(wait);
1734
1735	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1736	for (;;) {
1737		if (!timeo)
1738			break;
1739		if (signal_pending(current))
1740			break;
1741		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1742		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1743		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1744			break;
1745		if (sk->sk_shutdown & SEND_SHUTDOWN)
1746			break;
1747		if (sk->sk_err)
1748			break;
1749		timeo = schedule_timeout(timeo);
1750	}
1751	finish_wait(sk_sleep(sk), &wait);
1752	return timeo;
1753}
1754
1755
1756/*
1757 *	Generic send/receive buffer handlers
1758 */
1759
1760struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1761				     unsigned long data_len, int noblock,
1762				     int *errcode, int max_page_order)
1763{
1764	struct sk_buff *skb;
1765	long timeo;
1766	int err;
1767
1768	timeo = sock_sndtimeo(sk, noblock);
1769	for (;;) {
1770		err = sock_error(sk);
1771		if (err != 0)
1772			goto failure;
1773
1774		err = -EPIPE;
1775		if (sk->sk_shutdown & SEND_SHUTDOWN)
1776			goto failure;
1777
1778		if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
1779			break;
1780
1781		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1782		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1783		err = -EAGAIN;
1784		if (!timeo)
1785			goto failure;
1786		if (signal_pending(current))
1787			goto interrupted;
1788		timeo = sock_wait_for_wmem(sk, timeo);
1789	}
1790	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
1791				   errcode, sk->sk_allocation);
1792	if (skb)
1793		skb_set_owner_w(skb, sk);
1794	return skb;
1795
1796interrupted:
1797	err = sock_intr_errno(timeo);
1798failure:
1799	*errcode = err;
1800	return NULL;
1801}
1802EXPORT_SYMBOL(sock_alloc_send_pskb);
1803
1804struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1805				    int noblock, int *errcode)
1806{
1807	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
1808}
1809EXPORT_SYMBOL(sock_alloc_send_skb);
1810
1811/* On 32bit arches, an skb frag is limited to 2^15 */
1812#define SKB_FRAG_PAGE_ORDER	get_order(32768)
1813
1814/**
1815 * skb_page_frag_refill - check that a page_frag contains enough room
1816 * @sz: minimum size of the fragment we want to get
1817 * @pfrag: pointer to page_frag
1818 * @gfp: priority for memory allocation
1819 *
1820 * Note: While this allocator tries to use high order pages, there is
1821 * no guarantee that allocations succeed. Therefore, @sz MUST be
1822 * less or equal than PAGE_SIZE.
1823 */
1824bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
1825{
1826	if (pfrag->page) {
1827		if (atomic_read(&pfrag->page->_count) == 1) {
1828			pfrag->offset = 0;
1829			return true;
1830		}
1831		if (pfrag->offset + sz <= pfrag->size)
1832			return true;
1833		put_page(pfrag->page);
1834	}
1835
1836	pfrag->offset = 0;
1837	if (SKB_FRAG_PAGE_ORDER) {
1838		pfrag->page = alloc_pages(gfp | __GFP_COMP |
1839					  __GFP_NOWARN | __GFP_NORETRY,
1840					  SKB_FRAG_PAGE_ORDER);
1841		if (likely(pfrag->page)) {
1842			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
1843			return true;
1844		}
1845	}
1846	pfrag->page = alloc_page(gfp);
1847	if (likely(pfrag->page)) {
1848		pfrag->size = PAGE_SIZE;
1849		return true;
1850	}
1851	return false;
1852}
1853EXPORT_SYMBOL(skb_page_frag_refill);
1854
1855bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
1856{
1857	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
1858		return true;
1859
1860	sk_enter_memory_pressure(sk);
1861	sk_stream_moderate_sndbuf(sk);
1862	return false;
1863}
1864EXPORT_SYMBOL(sk_page_frag_refill);
1865
1866static void __lock_sock(struct sock *sk)
1867	__releases(&sk->sk_lock.slock)
1868	__acquires(&sk->sk_lock.slock)
1869{
1870	DEFINE_WAIT(wait);
1871
1872	for (;;) {
1873		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1874					TASK_UNINTERRUPTIBLE);
1875		spin_unlock_bh(&sk->sk_lock.slock);
1876		schedule();
1877		spin_lock_bh(&sk->sk_lock.slock);
1878		if (!sock_owned_by_user(sk))
1879			break;
1880	}
1881	finish_wait(&sk->sk_lock.wq, &wait);
1882}
1883
1884static void __release_sock(struct sock *sk)
1885	__releases(&sk->sk_lock.slock)
1886	__acquires(&sk->sk_lock.slock)
1887{
1888	struct sk_buff *skb = sk->sk_backlog.head;
1889
1890	do {
1891		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1892		bh_unlock_sock(sk);
1893
1894		do {
1895			struct sk_buff *next = skb->next;
1896
1897			prefetch(next);
1898			WARN_ON_ONCE(skb_dst_is_noref(skb));
1899			skb->next = NULL;
1900			sk_backlog_rcv(sk, skb);
1901
1902			/*
1903			 * We are in process context here with softirqs
1904			 * disabled, use cond_resched_softirq() to preempt.
1905			 * This is safe to do because we've taken the backlog
1906			 * queue private:
1907			 */
1908			cond_resched_softirq();
1909
1910			skb = next;
1911		} while (skb != NULL);
1912
1913		bh_lock_sock(sk);
1914	} while ((skb = sk->sk_backlog.head) != NULL);
1915
1916	/*
1917	 * Doing the zeroing here guarantee we can not loop forever
1918	 * while a wild producer attempts to flood us.
1919	 */
1920	sk->sk_backlog.len = 0;
1921}
1922
1923/**
1924 * sk_wait_data - wait for data to arrive at sk_receive_queue
1925 * @sk:    sock to wait on
1926 * @timeo: for how long
1927 *
1928 * Now socket state including sk->sk_err is changed only under lock,
1929 * hence we may omit checks after joining wait queue.
1930 * We check receive queue before schedule() only as optimization;
1931 * it is very likely that release_sock() added new data.
1932 */
1933int sk_wait_data(struct sock *sk, long *timeo)
1934{
1935	int rc;
1936	DEFINE_WAIT(wait);
1937
1938	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1939	set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1940	rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1941	clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1942	finish_wait(sk_sleep(sk), &wait);
1943	return rc;
1944}
1945EXPORT_SYMBOL(sk_wait_data);
1946
1947/**
1948 *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
1949 *	@sk: socket
1950 *	@size: memory size to allocate
1951 *	@kind: allocation type
1952 *
1953 *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
1954 *	rmem allocation. This function assumes that protocols which have
1955 *	memory_pressure use sk_wmem_queued as write buffer accounting.
1956 */
1957int __sk_mem_schedule(struct sock *sk, int size, int kind)
1958{
1959	struct proto *prot = sk->sk_prot;
1960	int amt = sk_mem_pages(size);
1961	long allocated;
1962	int parent_status = UNDER_LIMIT;
1963
1964	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
1965
1966	allocated = sk_memory_allocated_add(sk, amt, &parent_status);
1967
1968	/* Under limit. */
1969	if (parent_status == UNDER_LIMIT &&
1970			allocated <= sk_prot_mem_limits(sk, 0)) {
1971		sk_leave_memory_pressure(sk);
1972		return 1;
1973	}
1974
1975	/* Under pressure. (we or our parents) */
1976	if ((parent_status > SOFT_LIMIT) ||
1977			allocated > sk_prot_mem_limits(sk, 1))
1978		sk_enter_memory_pressure(sk);
1979
1980	/* Over hard limit (we or our parents) */
1981	if ((parent_status == OVER_LIMIT) ||
1982			(allocated > sk_prot_mem_limits(sk, 2)))
1983		goto suppress_allocation;
1984
1985	/* guarantee minimum buffer size under pressure */
1986	if (kind == SK_MEM_RECV) {
1987		if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
1988			return 1;
1989
1990	} else { /* SK_MEM_SEND */
1991		if (sk->sk_type == SOCK_STREAM) {
1992			if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
1993				return 1;
1994		} else if (atomic_read(&sk->sk_wmem_alloc) <
1995			   prot->sysctl_wmem[0])
1996				return 1;
1997	}
1998
1999	if (sk_has_memory_pressure(sk)) {
2000		int alloc;
2001
2002		if (!sk_under_memory_pressure(sk))
2003			return 1;
2004		alloc = sk_sockets_allocated_read_positive(sk);
2005		if (sk_prot_mem_limits(sk, 2) > alloc *
2006		    sk_mem_pages(sk->sk_wmem_queued +
2007				 atomic_read(&sk->sk_rmem_alloc) +
2008				 sk->sk_forward_alloc))
2009			return 1;
2010	}
2011
2012suppress_allocation:
2013
2014	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2015		sk_stream_moderate_sndbuf(sk);
2016
2017		/* Fail only if socket is _under_ its sndbuf.
2018		 * In this case we cannot block, so that we have to fail.
2019		 */
2020		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2021			return 1;
2022	}
2023
2024	trace_sock_exceed_buf_limit(sk, prot, allocated);
2025
2026	/* Alas. Undo changes. */
2027	sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
2028
2029	sk_memory_allocated_sub(sk, amt);
2030
2031	return 0;
2032}
2033EXPORT_SYMBOL(__sk_mem_schedule);
2034
2035/**
2036 *	__sk_reclaim - reclaim memory_allocated
2037 *	@sk: socket
2038 */
2039void __sk_mem_reclaim(struct sock *sk)
2040{
2041	sk_memory_allocated_sub(sk,
2042				sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT);
2043	sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
2044
2045	if (sk_under_memory_pressure(sk) &&
2046	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2047		sk_leave_memory_pressure(sk);
2048}
2049EXPORT_SYMBOL(__sk_mem_reclaim);
2050
2051
2052/*
2053 * Set of default routines for initialising struct proto_ops when
2054 * the protocol does not support a particular function. In certain
2055 * cases where it makes no sense for a protocol to have a "do nothing"
2056 * function, some default processing is provided.
2057 */
2058
2059int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2060{
2061	return -EOPNOTSUPP;
2062}
2063EXPORT_SYMBOL(sock_no_bind);
2064
2065int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2066		    int len, int flags)
2067{
2068	return -EOPNOTSUPP;
2069}
2070EXPORT_SYMBOL(sock_no_connect);
2071
2072int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2073{
2074	return -EOPNOTSUPP;
2075}
2076EXPORT_SYMBOL(sock_no_socketpair);
2077
2078int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
2079{
2080	return -EOPNOTSUPP;
2081}
2082EXPORT_SYMBOL(sock_no_accept);
2083
2084int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2085		    int *len, int peer)
2086{
2087	return -EOPNOTSUPP;
2088}
2089EXPORT_SYMBOL(sock_no_getname);
2090
2091unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
2092{
2093	return 0;
2094}
2095EXPORT_SYMBOL(sock_no_poll);
2096
2097int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2098{
2099	return -EOPNOTSUPP;
2100}
2101EXPORT_SYMBOL(sock_no_ioctl);
2102
2103int sock_no_listen(struct socket *sock, int backlog)
2104{
2105	return -EOPNOTSUPP;
2106}
2107EXPORT_SYMBOL(sock_no_listen);
2108
2109int sock_no_shutdown(struct socket *sock, int how)
2110{
2111	return -EOPNOTSUPP;
2112}
2113EXPORT_SYMBOL(sock_no_shutdown);
2114
2115int sock_no_setsockopt(struct socket *sock, int level, int optname,
2116		    char __user *optval, unsigned int optlen)
2117{
2118	return -EOPNOTSUPP;
2119}
2120EXPORT_SYMBOL(sock_no_setsockopt);
2121
2122int sock_no_getsockopt(struct socket *sock, int level, int optname,
2123		    char __user *optval, int __user *optlen)
2124{
2125	return -EOPNOTSUPP;
2126}
2127EXPORT_SYMBOL(sock_no_getsockopt);
2128
2129int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
2130		    size_t len)
2131{
2132	return -EOPNOTSUPP;
2133}
2134EXPORT_SYMBOL(sock_no_sendmsg);
2135
2136int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
2137		    size_t len, int flags)
2138{
2139	return -EOPNOTSUPP;
2140}
2141EXPORT_SYMBOL(sock_no_recvmsg);
2142
2143int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2144{
2145	/* Mirror missing mmap method error code */
2146	return -ENODEV;
2147}
2148EXPORT_SYMBOL(sock_no_mmap);
2149
2150ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2151{
2152	ssize_t res;
2153	struct msghdr msg = {.msg_flags = flags};
2154	struct kvec iov;
2155	char *kaddr = kmap(page);
2156	iov.iov_base = kaddr + offset;
2157	iov.iov_len = size;
2158	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2159	kunmap(page);
2160	return res;
2161}
2162EXPORT_SYMBOL(sock_no_sendpage);
2163
2164/*
2165 *	Default Socket Callbacks
2166 */
2167
2168static void sock_def_wakeup(struct sock *sk)
2169{
2170	struct socket_wq *wq;
2171
2172	rcu_read_lock();
2173	wq = rcu_dereference(sk->sk_wq);
2174	if (wq_has_sleeper(wq))
2175		wake_up_interruptible_all(&wq->wait);
2176	rcu_read_unlock();
2177}
2178
2179static void sock_def_error_report(struct sock *sk)
2180{
2181	struct socket_wq *wq;
2182
2183	rcu_read_lock();
2184	wq = rcu_dereference(sk->sk_wq);
2185	if (wq_has_sleeper(wq))
2186		wake_up_interruptible_poll(&wq->wait, POLLERR);
2187	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2188	rcu_read_unlock();
2189}
2190
2191static void sock_def_readable(struct sock *sk)
2192{
2193	struct socket_wq *wq;
2194
2195	rcu_read_lock();
2196	wq = rcu_dereference(sk->sk_wq);
2197	if (wq_has_sleeper(wq))
2198		wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
2199						POLLRDNORM | POLLRDBAND);
2200	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2201	rcu_read_unlock();
2202}
2203
2204static void sock_def_write_space(struct sock *sk)
2205{
2206	struct socket_wq *wq;
2207
2208	rcu_read_lock();
2209
2210	/* Do not wake up a writer until he can make "significant"
2211	 * progress.  --DaveM
2212	 */
2213	if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2214		wq = rcu_dereference(sk->sk_wq);
2215		if (wq_has_sleeper(wq))
2216			wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
2217						POLLWRNORM | POLLWRBAND);
2218
2219		/* Should agree with poll, otherwise some programs break */
2220		if (sock_writeable(sk))
2221			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2222	}
2223
2224	rcu_read_unlock();
2225}
2226
2227static void sock_def_destruct(struct sock *sk)
2228{
2229	kfree(sk->sk_protinfo);
2230}
2231
2232void sk_send_sigurg(struct sock *sk)
2233{
2234	if (sk->sk_socket && sk->sk_socket->file)
2235		if (send_sigurg(&sk->sk_socket->file->f_owner))
2236			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2237}
2238EXPORT_SYMBOL(sk_send_sigurg);
2239
2240void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2241		    unsigned long expires)
2242{
2243	if (!mod_timer(timer, expires))
2244		sock_hold(sk);
2245}
2246EXPORT_SYMBOL(sk_reset_timer);
2247
2248void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2249{
2250	if (del_timer(timer))
2251		__sock_put(sk);
2252}
2253EXPORT_SYMBOL(sk_stop_timer);
2254
2255void sock_init_data(struct socket *sock, struct sock *sk)
2256{
2257	skb_queue_head_init(&sk->sk_receive_queue);
2258	skb_queue_head_init(&sk->sk_write_queue);
2259	skb_queue_head_init(&sk->sk_error_queue);
2260
2261	sk->sk_send_head	=	NULL;
2262
2263	init_timer(&sk->sk_timer);
2264
2265	sk->sk_allocation	=	GFP_KERNEL;
2266	sk->sk_rcvbuf		=	sysctl_rmem_default;
2267	sk->sk_sndbuf		=	sysctl_wmem_default;
2268	sk->sk_state		=	TCP_CLOSE;
2269	sk_set_socket(sk, sock);
2270
2271	sock_set_flag(sk, SOCK_ZAPPED);
2272
2273	if (sock) {
2274		sk->sk_type	=	sock->type;
2275		sk->sk_wq	=	sock->wq;
2276		sock->sk	=	sk;
2277	} else
2278		sk->sk_wq	=	NULL;
2279
2280	spin_lock_init(&sk->sk_dst_lock);
2281	rwlock_init(&sk->sk_callback_lock);
2282	lockdep_set_class_and_name(&sk->sk_callback_lock,
2283			af_callback_keys + sk->sk_family,
2284			af_family_clock_key_strings[sk->sk_family]);
2285
2286	sk->sk_state_change	=	sock_def_wakeup;
2287	sk->sk_data_ready	=	sock_def_readable;
2288	sk->sk_write_space	=	sock_def_write_space;
2289	sk->sk_error_report	=	sock_def_error_report;
2290	sk->sk_destruct		=	sock_def_destruct;
2291
2292	sk->sk_frag.page	=	NULL;
2293	sk->sk_frag.offset	=	0;
2294	sk->sk_peek_off		=	-1;
2295
2296	sk->sk_peer_pid 	=	NULL;
2297	sk->sk_peer_cred	=	NULL;
2298	sk->sk_write_pending	=	0;
2299	sk->sk_rcvlowat		=	1;
2300	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
2301	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
2302
2303	sk->sk_stamp = ktime_set(-1L, 0);
2304
2305#ifdef CONFIG_NET_RX_BUSY_POLL
2306	sk->sk_napi_id		=	0;
2307	sk->sk_ll_usec		=	sysctl_net_busy_read;
2308#endif
2309
2310	sk->sk_max_pacing_rate = ~0U;
2311	sk->sk_pacing_rate = ~0U;
2312	/*
2313	 * Before updating sk_refcnt, we must commit prior changes to memory
2314	 * (Documentation/RCU/rculist_nulls.txt for details)
2315	 */
2316	smp_wmb();
2317	atomic_set(&sk->sk_refcnt, 1);
2318	atomic_set(&sk->sk_drops, 0);
2319}
2320EXPORT_SYMBOL(sock_init_data);
2321
2322void lock_sock_nested(struct sock *sk, int subclass)
2323{
2324	might_sleep();
2325	spin_lock_bh(&sk->sk_lock.slock);
2326	if (sk->sk_lock.owned)
2327		__lock_sock(sk);
2328	sk->sk_lock.owned = 1;
2329	spin_unlock(&sk->sk_lock.slock);
2330	/*
2331	 * The sk_lock has mutex_lock() semantics here:
2332	 */
2333	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2334	local_bh_enable();
2335}
2336EXPORT_SYMBOL(lock_sock_nested);
2337
2338void release_sock(struct sock *sk)
2339{
2340	/*
2341	 * The sk_lock has mutex_unlock() semantics:
2342	 */
2343	mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
2344
2345	spin_lock_bh(&sk->sk_lock.slock);
2346	if (sk->sk_backlog.tail)
2347		__release_sock(sk);
2348
2349	/* Warning : release_cb() might need to release sk ownership,
2350	 * ie call sock_release_ownership(sk) before us.
2351	 */
2352	if (sk->sk_prot->release_cb)
2353		sk->sk_prot->release_cb(sk);
2354
2355	sock_release_ownership(sk);
2356	if (waitqueue_active(&sk->sk_lock.wq))
2357		wake_up(&sk->sk_lock.wq);
2358	spin_unlock_bh(&sk->sk_lock.slock);
2359}
2360EXPORT_SYMBOL(release_sock);
2361
2362/**
2363 * lock_sock_fast - fast version of lock_sock
2364 * @sk: socket
2365 *
2366 * This version should be used for very small section, where process wont block
2367 * return false if fast path is taken
2368 *   sk_lock.slock locked, owned = 0, BH disabled
2369 * return true if slow path is taken
2370 *   sk_lock.slock unlocked, owned = 1, BH enabled
2371 */
2372bool lock_sock_fast(struct sock *sk)
2373{
2374	might_sleep();
2375	spin_lock_bh(&sk->sk_lock.slock);
2376
2377	if (!sk->sk_lock.owned)
2378		/*
2379		 * Note : We must disable BH
2380		 */
2381		return false;
2382
2383	__lock_sock(sk);
2384	sk->sk_lock.owned = 1;
2385	spin_unlock(&sk->sk_lock.slock);
2386	/*
2387	 * The sk_lock has mutex_lock() semantics here:
2388	 */
2389	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2390	local_bh_enable();
2391	return true;
2392}
2393EXPORT_SYMBOL(lock_sock_fast);
2394
2395int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2396{
2397	struct timeval tv;
2398	if (!sock_flag(sk, SOCK_TIMESTAMP))
2399		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2400	tv = ktime_to_timeval(sk->sk_stamp);
2401	if (tv.tv_sec == -1)
2402		return -ENOENT;
2403	if (tv.tv_sec == 0) {
2404		sk->sk_stamp = ktime_get_real();
2405		tv = ktime_to_timeval(sk->sk_stamp);
2406	}
2407	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2408}
2409EXPORT_SYMBOL(sock_get_timestamp);
2410
2411int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2412{
2413	struct timespec ts;
2414	if (!sock_flag(sk, SOCK_TIMESTAMP))
2415		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2416	ts = ktime_to_timespec(sk->sk_stamp);
2417	if (ts.tv_sec == -1)
2418		return -ENOENT;
2419	if (ts.tv_sec == 0) {
2420		sk->sk_stamp = ktime_get_real();
2421		ts = ktime_to_timespec(sk->sk_stamp);
2422	}
2423	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2424}
2425EXPORT_SYMBOL(sock_get_timestampns);
2426
2427void sock_enable_timestamp(struct sock *sk, int flag)
2428{
2429	if (!sock_flag(sk, flag)) {
2430		unsigned long previous_flags = sk->sk_flags;
2431
2432		sock_set_flag(sk, flag);
2433		/*
2434		 * we just set one of the two flags which require net
2435		 * time stamping, but time stamping might have been on
2436		 * already because of the other one
2437		 */
2438		if (!(previous_flags & SK_FLAGS_TIMESTAMP))
2439			net_enable_timestamp();
2440	}
2441}
2442
2443int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
2444		       int level, int type)
2445{
2446	struct sock_exterr_skb *serr;
2447	struct sk_buff *skb;
2448	int copied, err;
2449
2450	err = -EAGAIN;
2451	skb = sock_dequeue_err_skb(sk);
2452	if (skb == NULL)
2453		goto out;
2454
2455	copied = skb->len;
2456	if (copied > len) {
2457		msg->msg_flags |= MSG_TRUNC;
2458		copied = len;
2459	}
2460	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
2461	if (err)
2462		goto out_free_skb;
2463
2464	sock_recv_timestamp(msg, sk, skb);
2465
2466	serr = SKB_EXT_ERR(skb);
2467	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
2468
2469	msg->msg_flags |= MSG_ERRQUEUE;
2470	err = copied;
2471
2472out_free_skb:
2473	kfree_skb(skb);
2474out:
2475	return err;
2476}
2477EXPORT_SYMBOL(sock_recv_errqueue);
2478
2479/*
2480 *	Get a socket option on an socket.
2481 *
2482 *	FIX: POSIX 1003.1g is very ambiguous here. It states that
2483 *	asynchronous errors should be reported by getsockopt. We assume
2484 *	this means if you specify SO_ERROR (otherwise whats the point of it).
2485 */
2486int sock_common_getsockopt(struct socket *sock, int level, int optname,
2487			   char __user *optval, int __user *optlen)
2488{
2489	struct sock *sk = sock->sk;
2490
2491	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2492}
2493EXPORT_SYMBOL(sock_common_getsockopt);
2494
2495#ifdef CONFIG_COMPAT
2496int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2497				  char __user *optval, int __user *optlen)
2498{
2499	struct sock *sk = sock->sk;
2500
2501	if (sk->sk_prot->compat_getsockopt != NULL)
2502		return sk->sk_prot->compat_getsockopt(sk, level, optname,
2503						      optval, optlen);
2504	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2505}
2506EXPORT_SYMBOL(compat_sock_common_getsockopt);
2507#endif
2508
2509int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
2510			struct msghdr *msg, size_t size, int flags)
2511{
2512	struct sock *sk = sock->sk;
2513	int addr_len = 0;
2514	int err;
2515
2516	err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
2517				   flags & ~MSG_DONTWAIT, &addr_len);
2518	if (err >= 0)
2519		msg->msg_namelen = addr_len;
2520	return err;
2521}
2522EXPORT_SYMBOL(sock_common_recvmsg);
2523
2524/*
2525 *	Set socket options on an inet socket.
2526 */
2527int sock_common_setsockopt(struct socket *sock, int level, int optname,
2528			   char __user *optval, unsigned int optlen)
2529{
2530	struct sock *sk = sock->sk;
2531
2532	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2533}
2534EXPORT_SYMBOL(sock_common_setsockopt);
2535
2536#ifdef CONFIG_COMPAT
2537int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2538				  char __user *optval, unsigned int optlen)
2539{
2540	struct sock *sk = sock->sk;
2541
2542	if (sk->sk_prot->compat_setsockopt != NULL)
2543		return sk->sk_prot->compat_setsockopt(sk, level, optname,
2544						      optval, optlen);
2545	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2546}
2547EXPORT_SYMBOL(compat_sock_common_setsockopt);
2548#endif
2549
2550void sk_common_release(struct sock *sk)
2551{
2552	if (sk->sk_prot->destroy)
2553		sk->sk_prot->destroy(sk);
2554
2555	/*
2556	 * Observation: when sock_common_release is called, processes have
2557	 * no access to socket. But net still has.
2558	 * Step one, detach it from networking:
2559	 *
2560	 * A. Remove from hash tables.
2561	 */
2562
2563	sk->sk_prot->unhash(sk);
2564
2565	/*
2566	 * In this point socket cannot receive new packets, but it is possible
2567	 * that some packets are in flight because some CPU runs receiver and
2568	 * did hash table lookup before we unhashed socket. They will achieve
2569	 * receive queue and will be purged by socket destructor.
2570	 *
2571	 * Also we still have packets pending on receive queue and probably,
2572	 * our own packets waiting in device queues. sock_destroy will drain
2573	 * receive queue, but transmitted packets will delay socket destruction
2574	 * until the last reference will be released.
2575	 */
2576
2577	sock_orphan(sk);
2578
2579	xfrm_sk_free_policy(sk);
2580
2581	sk_refcnt_debug_release(sk);
2582
2583	if (sk->sk_frag.page) {
2584		put_page(sk->sk_frag.page);
2585		sk->sk_frag.page = NULL;
2586	}
2587
2588	sock_put(sk);
2589}
2590EXPORT_SYMBOL(sk_common_release);
2591
2592#ifdef CONFIG_PROC_FS
2593#define PROTO_INUSE_NR	64	/* should be enough for the first time */
2594struct prot_inuse {
2595	int val[PROTO_INUSE_NR];
2596};
2597
2598static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2599
2600#ifdef CONFIG_NET_NS
2601void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2602{
2603	__this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
2604}
2605EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2606
2607int sock_prot_inuse_get(struct net *net, struct proto *prot)
2608{
2609	int cpu, idx = prot->inuse_idx;
2610	int res = 0;
2611
2612	for_each_possible_cpu(cpu)
2613		res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2614
2615	return res >= 0 ? res : 0;
2616}
2617EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2618
2619static int __net_init sock_inuse_init_net(struct net *net)
2620{
2621	net->core.inuse = alloc_percpu(struct prot_inuse);
2622	return net->core.inuse ? 0 : -ENOMEM;
2623}
2624
2625static void __net_exit sock_inuse_exit_net(struct net *net)
2626{
2627	free_percpu(net->core.inuse);
2628}
2629
2630static struct pernet_operations net_inuse_ops = {
2631	.init = sock_inuse_init_net,
2632	.exit = sock_inuse_exit_net,
2633};
2634
2635static __init int net_inuse_init(void)
2636{
2637	if (register_pernet_subsys(&net_inuse_ops))
2638		panic("Cannot initialize net inuse counters");
2639
2640	return 0;
2641}
2642
2643core_initcall(net_inuse_init);
2644#else
2645static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2646
2647void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2648{
2649	__this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
2650}
2651EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2652
2653int sock_prot_inuse_get(struct net *net, struct proto *prot)
2654{
2655	int cpu, idx = prot->inuse_idx;
2656	int res = 0;
2657
2658	for_each_possible_cpu(cpu)
2659		res += per_cpu(prot_inuse, cpu).val[idx];
2660
2661	return res >= 0 ? res : 0;
2662}
2663EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2664#endif
2665
2666static void assign_proto_idx(struct proto *prot)
2667{
2668	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2669
2670	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2671		pr_err("PROTO_INUSE_NR exhausted\n");
2672		return;
2673	}
2674
2675	set_bit(prot->inuse_idx, proto_inuse_idx);
2676}
2677
2678static void release_proto_idx(struct proto *prot)
2679{
2680	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2681		clear_bit(prot->inuse_idx, proto_inuse_idx);
2682}
2683#else
2684static inline void assign_proto_idx(struct proto *prot)
2685{
2686}
2687
2688static inline void release_proto_idx(struct proto *prot)
2689{
2690}
2691#endif
2692
2693int proto_register(struct proto *prot, int alloc_slab)
2694{
2695	if (alloc_slab) {
2696		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2697					SLAB_HWCACHE_ALIGN | prot->slab_flags,
2698					NULL);
2699
2700		if (prot->slab == NULL) {
2701			pr_crit("%s: Can't create sock SLAB cache!\n",
2702				prot->name);
2703			goto out;
2704		}
2705
2706		if (prot->rsk_prot != NULL) {
2707			prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name);
2708			if (prot->rsk_prot->slab_name == NULL)
2709				goto out_free_sock_slab;
2710
2711			prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
2712								 prot->rsk_prot->obj_size, 0,
2713								 SLAB_HWCACHE_ALIGN, NULL);
2714
2715			if (prot->rsk_prot->slab == NULL) {
2716				pr_crit("%s: Can't create request sock SLAB cache!\n",
2717					prot->name);
2718				goto out_free_request_sock_slab_name;
2719			}
2720		}
2721
2722		if (prot->twsk_prot != NULL) {
2723			prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
2724
2725			if (prot->twsk_prot->twsk_slab_name == NULL)
2726				goto out_free_request_sock_slab;
2727
2728			prot->twsk_prot->twsk_slab =
2729				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2730						  prot->twsk_prot->twsk_obj_size,
2731						  0,
2732						  SLAB_HWCACHE_ALIGN |
2733							prot->slab_flags,
2734						  NULL);
2735			if (prot->twsk_prot->twsk_slab == NULL)
2736				goto out_free_timewait_sock_slab_name;
2737		}
2738	}
2739
2740	mutex_lock(&proto_list_mutex);
2741	list_add(&prot->node, &proto_list);
2742	assign_proto_idx(prot);
2743	mutex_unlock(&proto_list_mutex);
2744	return 0;
2745
2746out_free_timewait_sock_slab_name:
2747	kfree(prot->twsk_prot->twsk_slab_name);
2748out_free_request_sock_slab:
2749	if (prot->rsk_prot && prot->rsk_prot->slab) {
2750		kmem_cache_destroy(prot->rsk_prot->slab);
2751		prot->rsk_prot->slab = NULL;
2752	}
2753out_free_request_sock_slab_name:
2754	if (prot->rsk_prot)
2755		kfree(prot->rsk_prot->slab_name);
2756out_free_sock_slab:
2757	kmem_cache_destroy(prot->slab);
2758	prot->slab = NULL;
2759out:
2760	return -ENOBUFS;
2761}
2762EXPORT_SYMBOL(proto_register);
2763
2764void proto_unregister(struct proto *prot)
2765{
2766	mutex_lock(&proto_list_mutex);
2767	release_proto_idx(prot);
2768	list_del(&prot->node);
2769	mutex_unlock(&proto_list_mutex);
2770
2771	if (prot->slab != NULL) {
2772		kmem_cache_destroy(prot->slab);
2773		prot->slab = NULL;
2774	}
2775
2776	if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
2777		kmem_cache_destroy(prot->rsk_prot->slab);
2778		kfree(prot->rsk_prot->slab_name);
2779		prot->rsk_prot->slab = NULL;
2780	}
2781
2782	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2783		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2784		kfree(prot->twsk_prot->twsk_slab_name);
2785		prot->twsk_prot->twsk_slab = NULL;
2786	}
2787}
2788EXPORT_SYMBOL(proto_unregister);
2789
2790#ifdef CONFIG_PROC_FS
2791static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2792	__acquires(proto_list_mutex)
2793{
2794	mutex_lock(&proto_list_mutex);
2795	return seq_list_start_head(&proto_list, *pos);
2796}
2797
2798static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2799{
2800	return seq_list_next(v, &proto_list, pos);
2801}
2802
2803static void proto_seq_stop(struct seq_file *seq, void *v)
2804	__releases(proto_list_mutex)
2805{
2806	mutex_unlock(&proto_list_mutex);
2807}
2808
2809static char proto_method_implemented(const void *method)
2810{
2811	return method == NULL ? 'n' : 'y';
2812}
2813static long sock_prot_memory_allocated(struct proto *proto)
2814{
2815	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
2816}
2817
2818static char *sock_prot_memory_pressure(struct proto *proto)
2819{
2820	return proto->memory_pressure != NULL ?
2821	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
2822}
2823
2824static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2825{
2826
2827	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
2828			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2829		   proto->name,
2830		   proto->obj_size,
2831		   sock_prot_inuse_get(seq_file_net(seq), proto),
2832		   sock_prot_memory_allocated(proto),
2833		   sock_prot_memory_pressure(proto),
2834		   proto->max_header,
2835		   proto->slab == NULL ? "no" : "yes",
2836		   module_name(proto->owner),
2837		   proto_method_implemented(proto->close),
2838		   proto_method_implemented(proto->connect),
2839		   proto_method_implemented(proto->disconnect),
2840		   proto_method_implemented(proto->accept),
2841		   proto_method_implemented(proto->ioctl),
2842		   proto_method_implemented(proto->init),
2843		   proto_method_implemented(proto->destroy),
2844		   proto_method_implemented(proto->shutdown),
2845		   proto_method_implemented(proto->setsockopt),
2846		   proto_method_implemented(proto->getsockopt),
2847		   proto_method_implemented(proto->sendmsg),
2848		   proto_method_implemented(proto->recvmsg),
2849		   proto_method_implemented(proto->sendpage),
2850		   proto_method_implemented(proto->bind),
2851		   proto_method_implemented(proto->backlog_rcv),
2852		   proto_method_implemented(proto->hash),
2853		   proto_method_implemented(proto->unhash),
2854		   proto_method_implemented(proto->get_port),
2855		   proto_method_implemented(proto->enter_memory_pressure));
2856}
2857
2858static int proto_seq_show(struct seq_file *seq, void *v)
2859{
2860	if (v == &proto_list)
2861		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2862			   "protocol",
2863			   "size",
2864			   "sockets",
2865			   "memory",
2866			   "press",
2867			   "maxhdr",
2868			   "slab",
2869			   "module",
2870			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2871	else
2872		proto_seq_printf(seq, list_entry(v, struct proto, node));
2873	return 0;
2874}
2875
2876static const struct seq_operations proto_seq_ops = {
2877	.start  = proto_seq_start,
2878	.next   = proto_seq_next,
2879	.stop   = proto_seq_stop,
2880	.show   = proto_seq_show,
2881};
2882
2883static int proto_seq_open(struct inode *inode, struct file *file)
2884{
2885	return seq_open_net(inode, file, &proto_seq_ops,
2886			    sizeof(struct seq_net_private));
2887}
2888
2889static const struct file_operations proto_seq_fops = {
2890	.owner		= THIS_MODULE,
2891	.open		= proto_seq_open,
2892	.read		= seq_read,
2893	.llseek		= seq_lseek,
2894	.release	= seq_release_net,
2895};
2896
2897static __net_init int proto_init_net(struct net *net)
2898{
2899	if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
2900		return -ENOMEM;
2901
2902	return 0;
2903}
2904
2905static __net_exit void proto_exit_net(struct net *net)
2906{
2907	remove_proc_entry("protocols", net->proc_net);
2908}
2909
2910
2911static __net_initdata struct pernet_operations proto_net_ops = {
2912	.init = proto_init_net,
2913	.exit = proto_exit_net,
2914};
2915
2916static int __init proto_init(void)
2917{
2918	return register_pernet_subsys(&proto_net_ops);
2919}
2920
2921subsys_initcall(proto_init);
2922
2923#endif /* PROC_FS */
2924