sock.c revision 4d276eb6a478307a28ae843836c455bf04b37a3c
1/*
2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
3 *		operating system.  INET is implemented using the  BSD Socket
4 *		interface as the means of communication with the user level.
5 *
6 *		Generic socket support routines. Memory allocators, socket lock/release
7 *		handler for protocols to use and generic option handler.
8 *
9 *
10 * Authors:	Ross Biro
11 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 *		Florian La Roche, <flla@stud.uni-sb.de>
13 *		Alan Cox, <A.Cox@swansea.ac.uk>
14 *
15 * Fixes:
16 *		Alan Cox	: 	Numerous verify_area() problems
17 *		Alan Cox	:	Connecting on a connecting socket
18 *					now returns an error for tcp.
19 *		Alan Cox	:	sock->protocol is set correctly.
20 *					and is not sometimes left as 0.
21 *		Alan Cox	:	connect handles icmp errors on a
22 *					connect properly. Unfortunately there
23 *					is a restart syscall nasty there. I
24 *					can't match BSD without hacking the C
25 *					library. Ideas urgently sought!
26 *		Alan Cox	:	Disallow bind() to addresses that are
27 *					not ours - especially broadcast ones!!
28 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30 *					instead they leave that for the DESTROY timer.
31 *		Alan Cox	:	Clean up error flag in accept
32 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33 *					was buggy. Put a remove_sock() in the handler
34 *					for memory when we hit 0. Also altered the timer
35 *					code. The ACK stuff can wait and needs major
36 *					TCP layer surgery.
37 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38 *					and fixed timer/inet_bh race.
39 *		Alan Cox	:	Added zapped flag for TCP
40 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47 *	Pauline Middelink	:	identd support
48 *		Alan Cox	:	Fixed connect() taking signals I think.
49 *		Alan Cox	:	SO_LINGER supported
50 *		Alan Cox	:	Error reporting fixes
51 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52 *		Alan Cox	:	inet sockets don't set sk->type!
53 *		Alan Cox	:	Split socket option code
54 *		Alan Cox	:	Callbacks
55 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56 *		Alex		:	Removed restriction on inet fioctl
57 *		Alan Cox	:	Splitting INET from NET core
58 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60 *		Alan Cox	:	Split IP from generic code
61 *		Alan Cox	:	New kfree_skbmem()
62 *		Alan Cox	:	Make SO_DEBUG superuser only.
63 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64 *					(compatibility fix)
65 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66 *		Alan Cox	:	Allocator for a socket is settable.
67 *		Alan Cox	:	SO_ERROR includes soft errors.
68 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69 *		Alan Cox	: 	Generic socket allocation to make hooks
70 *					easier (suggested by Craig Metz).
71 *		Michael Pall	:	SO_ERROR returns positive errno again
72 *              Steve Whitehouse:       Added default destructor to free
73 *                                      protocol private data.
74 *              Steve Whitehouse:       Added various other default routines
75 *                                      common to several socket families.
76 *              Chris Evans     :       Call suser() check last on F_SETOWN
77 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79 *		Andi Kleen	:	Fix write_space callback
80 *		Chris Evans	:	Security fixes - signedness again
81 *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82 *
83 * To Fix:
84 *
85 *
86 *		This program is free software; you can redistribute it and/or
87 *		modify it under the terms of the GNU General Public License
88 *		as published by the Free Software Foundation; either version
89 *		2 of the License, or (at your option) any later version.
90 */
91
92#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
93
94#include <linux/capability.h>
95#include <linux/errno.h>
96#include <linux/errqueue.h>
97#include <linux/types.h>
98#include <linux/socket.h>
99#include <linux/in.h>
100#include <linux/kernel.h>
101#include <linux/module.h>
102#include <linux/proc_fs.h>
103#include <linux/seq_file.h>
104#include <linux/sched.h>
105#include <linux/timer.h>
106#include <linux/string.h>
107#include <linux/sockios.h>
108#include <linux/net.h>
109#include <linux/mm.h>
110#include <linux/slab.h>
111#include <linux/interrupt.h>
112#include <linux/poll.h>
113#include <linux/tcp.h>
114#include <linux/init.h>
115#include <linux/highmem.h>
116#include <linux/user_namespace.h>
117#include <linux/static_key.h>
118#include <linux/memcontrol.h>
119#include <linux/prefetch.h>
120
121#include <asm/uaccess.h>
122
123#include <linux/netdevice.h>
124#include <net/protocol.h>
125#include <linux/skbuff.h>
126#include <net/net_namespace.h>
127#include <net/request_sock.h>
128#include <net/sock.h>
129#include <linux/net_tstamp.h>
130#include <net/xfrm.h>
131#include <linux/ipsec.h>
132#include <net/cls_cgroup.h>
133#include <net/netprio_cgroup.h>
134
135#include <linux/filter.h>
136
137#include <trace/events/sock.h>
138
139#ifdef CONFIG_INET
140#include <net/tcp.h>
141#endif
142
143#include <net/busy_poll.h>
144
145static DEFINE_MUTEX(proto_list_mutex);
146static LIST_HEAD(proto_list);
147
148/**
149 * sk_ns_capable - General socket capability test
150 * @sk: Socket to use a capability on or through
151 * @user_ns: The user namespace of the capability to use
152 * @cap: The capability to use
153 *
154 * Test to see if the opener of the socket had the capability @cap when
155 * the socket was created and if the current process has the capability
156 * @cap in the user namespace @user_ns.
157 */
158bool sk_ns_capable(const struct sock *sk,
159		   struct user_namespace *user_ns, int cap)
160{
161	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
162		ns_capable(user_ns, cap);
163}
164EXPORT_SYMBOL(sk_ns_capable);
165
166/**
167 * sk_capable - Socket global capability test
168 * @sk: Socket to use a capability on or through
169 * @cap: The global capability to use
170 *
171 * Test to see if the opener of the socket had the capability @cap when
172 * the socket was created and if the current process has the capability
173 * @cap in all user namespaces.
174 */
175bool sk_capable(const struct sock *sk, int cap)
176{
177	return sk_ns_capable(sk, &init_user_ns, cap);
178}
179EXPORT_SYMBOL(sk_capable);
180
181/**
182 * sk_net_capable - Network namespace socket capability test
183 * @sk: Socket to use a capability on or through
184 * @cap: The capability to use
185 *
186 * Test to see if the opener of the socket had the capability @cap when the
187 * socket was created and if the current process has the capability @cap over
188 * the network namespace the socket is a member of.
189 */
190bool sk_net_capable(const struct sock *sk, int cap)
191{
192	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
193}
194EXPORT_SYMBOL(sk_net_capable);
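/* Note on the three helpers above: both conditions must hold -- the opener's
 * credentials at socket creation time (checked via the socket's file) and the
 * credentials of the current task must both carry the capability. Neither a
 * privileged opener nor a privileged caller is sufficient on its own.
 */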
195
196
197#ifdef CONFIG_MEMCG_KMEM
198int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
199{
200	struct proto *proto;
201	int ret = 0;
202
203	mutex_lock(&proto_list_mutex);
204	list_for_each_entry(proto, &proto_list, node) {
205		if (proto->init_cgroup) {
206			ret = proto->init_cgroup(memcg, ss);
207			if (ret)
208				goto out;
209		}
210	}
211
212	mutex_unlock(&proto_list_mutex);
213	return ret;
214out:
215	list_for_each_entry_continue_reverse(proto, &proto_list, node)
216		if (proto->destroy_cgroup)
217			proto->destroy_cgroup(memcg);
218	mutex_unlock(&proto_list_mutex);
219	return ret;
220}
221
222void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg)
223{
224	struct proto *proto;
225
226	mutex_lock(&proto_list_mutex);
227	list_for_each_entry_reverse(proto, &proto_list, node)
228		if (proto->destroy_cgroup)
229			proto->destroy_cgroup(memcg);
230	mutex_unlock(&proto_list_mutex);
231}
232#endif
233
234/*
235 * Each address family might have different locking rules, so we have
236 * one slock key per address family:
237 */
238static struct lock_class_key af_family_keys[AF_MAX];
239static struct lock_class_key af_family_slock_keys[AF_MAX];
240
241#if defined(CONFIG_MEMCG_KMEM)
242struct static_key memcg_socket_limit_enabled;
243EXPORT_SYMBOL(memcg_socket_limit_enabled);
244#endif
245
246/*
247 * Make lock validator output more readable. (we pre-construct these
248 * strings build-time, so that runtime initialization of socket
249 * locks is fast):
250 */
251static const char *const af_family_key_strings[AF_MAX+1] = {
252  "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
253  "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
254  "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
255  "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
256  "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
257  "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
258  "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
259  "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
260  "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
261  "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
262  "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV"        ,
263  "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
264  "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
265  "sk_lock-AF_NFC"   , "sk_lock-AF_VSOCK"    , "sk_lock-AF_MAX"
266};
267static const char *const af_family_slock_key_strings[AF_MAX+1] = {
268  "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
269  "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
270  "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
271  "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
272  "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
273  "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
274  "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
275  "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
276  "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
277  "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
278  "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
279  "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
280  "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
281  "slock-AF_NFC"   , "slock-AF_VSOCK"    ,"slock-AF_MAX"
282};
283static const char *const af_family_clock_key_strings[AF_MAX+1] = {
284  "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
285  "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
286  "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
287  "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
288  "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
289  "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
290  "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
291  "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
292  "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
293  "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
294  "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
295  "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
296  "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
297  "clock-AF_NFC"   , "clock-AF_VSOCK"    , "clock-AF_MAX"
298};
299
300/*
301 * sk_callback_lock locking rules are per-address-family,
302 * so split the lock classes by using a per-AF key:
303 */
304static struct lock_class_key af_callback_keys[AF_MAX];
305
306/* Take into consideration the size of the struct sk_buff overhead in the
307 * determination of these values, since that is non-constant across
308 * platforms.  This makes socket queueing behavior and performance
309 * not depend upon such differences.
310 */
311#define _SK_MEM_PACKETS		256
312#define _SK_MEM_OVERHEAD	SKB_TRUESIZE(256)
313#define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
314#define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
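/* As a rough illustration (assuming a typical 64-bit build where
 * SKB_TRUESIZE(256) comes to around 800 bytes once struct sk_buff and
 * struct skb_shared_info are included), the defaults above work out to
 * roughly 200 KB of send/receive buffer space per socket; the exact figure
 * depends on the architecture and kernel configuration.
 */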
315
316/* Run time adjustable parameters. */
317__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
318EXPORT_SYMBOL(sysctl_wmem_max);
319__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
320EXPORT_SYMBOL(sysctl_rmem_max);
321__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
322__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
323
324/* Maximal space eaten by iovec or ancillary data plus some space */
325int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
326EXPORT_SYMBOL(sysctl_optmem_max);
327
328struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
329EXPORT_SYMBOL_GPL(memalloc_socks);
330
331/**
332 * sk_set_memalloc - sets %SOCK_MEMALLOC
333 * @sk: socket to set it on
334 *
335 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
336 * It's the responsibility of the admin to adjust min_free_kbytes
337 * to meet the requirements.
338 */
339void sk_set_memalloc(struct sock *sk)
340{
341	sock_set_flag(sk, SOCK_MEMALLOC);
342	sk->sk_allocation |= __GFP_MEMALLOC;
343	static_key_slow_inc(&memalloc_socks);
344}
345EXPORT_SYMBOL_GPL(sk_set_memalloc);
346
347void sk_clear_memalloc(struct sock *sk)
348{
349	sock_reset_flag(sk, SOCK_MEMALLOC);
350	sk->sk_allocation &= ~__GFP_MEMALLOC;
351	static_key_slow_dec(&memalloc_socks);
352
353	/*
354	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
355	 * progress of swapping. However, if SOCK_MEMALLOC is cleared while
356	 * it has rmem allocations there is a risk that the user of the
357	 * socket cannot make forward progress due to exceeding the rmem
358	 * limits. By rights, sk_clear_memalloc() should only be called
359	 * on sockets being torn down but warn and reset the accounting if
360	 * that assumption breaks.
361	 */
362	if (WARN_ON(sk->sk_forward_alloc))
363		sk_mem_reclaim(sk);
364}
365EXPORT_SYMBOL_GPL(sk_clear_memalloc);
366
367int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
368{
369	int ret;
370	unsigned long pflags = current->flags;
371
372	/* these should have been dropped before queueing */
373	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
374
375	current->flags |= PF_MEMALLOC;
376	ret = sk->sk_backlog_rcv(sk, skb);
377	tsk_restore_flags(current, pflags, PF_MEMALLOC);
378
379	return ret;
380}
381EXPORT_SYMBOL(__sk_backlog_rcv);
382
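/* Convert a userspace struct timeval into a timeout in jiffies. For example,
 * with HZ == 1000 a timeval of {1, 500000} becomes 1500 jiffies (sub-tick
 * remainders are rounded up), while {0, 0} leaves the timeout at
 * MAX_SCHEDULE_TIMEOUT, i.e. "never time out".
 */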
383static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
384{
385	struct timeval tv;
386
387	if (optlen < sizeof(tv))
388		return -EINVAL;
389	if (copy_from_user(&tv, optval, sizeof(tv)))
390		return -EFAULT;
391	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
392		return -EDOM;
393
394	if (tv.tv_sec < 0) {
395		static int warned __read_mostly;
396
397		*timeo_p = 0;
398		if (warned < 10 && net_ratelimit()) {
399			warned++;
400			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
401				__func__, current->comm, task_pid_nr(current));
402		}
403		return 0;
404	}
405	*timeo_p = MAX_SCHEDULE_TIMEOUT;
406	if (tv.tv_sec == 0 && tv.tv_usec == 0)
407		return 0;
408	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
409		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
410	return 0;
411}
412
413static void sock_warn_obsolete_bsdism(const char *name)
414{
415	static int warned;
416	static char warncomm[TASK_COMM_LEN];
417	if (strcmp(warncomm, current->comm) && warned < 5) {
418		strcpy(warncomm,  current->comm);
419		pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
420			warncomm, name);
421		warned++;
422	}
423}
424
425#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
426
427static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
428{
429	if (sk->sk_flags & flags) {
430		sk->sk_flags &= ~flags;
431		if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP))
432			net_disable_timestamp();
433	}
434}
435
436
437int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
438{
439	int err;
440	int skb_len;
441	unsigned long flags;
442	struct sk_buff_head *list = &sk->sk_receive_queue;
443
444	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
445		atomic_inc(&sk->sk_drops);
446		trace_sock_rcvqueue_full(sk, skb);
447		return -ENOMEM;
448	}
449
450	err = sk_filter(sk, skb);
451	if (err)
452		return err;
453
454	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
455		atomic_inc(&sk->sk_drops);
456		return -ENOBUFS;
457	}
458
459	skb->dev = NULL;
460	skb_set_owner_r(skb, sk);
461
462	/* Cache the SKB length before we tack it onto the receive
463	 * queue.  Once it is added it no longer belongs to us and
464	 * may be freed by other threads of control pulling packets
465	 * from the queue.
466	 */
467	skb_len = skb->len;
468
469	/* We escape from the RCU protected region; make sure we don't
470	 * leak a non-refcounted dst.
471	 */
472	skb_dst_force(skb);
473
474	spin_lock_irqsave(&list->lock, flags);
475	skb->dropcount = atomic_read(&sk->sk_drops);
476	__skb_queue_tail(list, skb);
477	spin_unlock_irqrestore(&list->lock, flags);
478
479	if (!sock_flag(sk, SOCK_DEAD))
480		sk->sk_data_ready(sk);
481	return 0;
482}
483EXPORT_SYMBOL(sock_queue_rcv_skb);
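/* The drop counter snapshot stored in skb->dropcount above is what recvmsg()
 * later reports through the SO_RXQ_OVFL ancillary message when that socket
 * option is enabled.
 */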
484
485int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
486{
487	int rc = NET_RX_SUCCESS;
488
489	if (sk_filter(sk, skb))
490		goto discard_and_relse;
491
492	skb->dev = NULL;
493
494	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
495		atomic_inc(&sk->sk_drops);
496		goto discard_and_relse;
497	}
498	if (nested)
499		bh_lock_sock_nested(sk);
500	else
501		bh_lock_sock(sk);
502	if (!sock_owned_by_user(sk)) {
503		/*
504		 * trylock + unlock semantics:
505		 */
506		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
507
508		rc = sk_backlog_rcv(sk, skb);
509
510		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
511	} else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
512		bh_unlock_sock(sk);
513		atomic_inc(&sk->sk_drops);
514		goto discard_and_relse;
515	}
516
517	bh_unlock_sock(sk);
518out:
519	sock_put(sk);
520	return rc;
521discard_and_relse:
522	kfree_skb(skb);
523	goto out;
524}
525EXPORT_SYMBOL(sk_receive_skb);
526
527struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
528{
529	struct dst_entry *dst = __sk_dst_get(sk);
530
531	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
532		sk_tx_queue_clear(sk);
533		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
534		dst_release(dst);
535		return NULL;
536	}
537
538	return dst;
539}
540EXPORT_SYMBOL(__sk_dst_check);
541
542struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
543{
544	struct dst_entry *dst = sk_dst_get(sk);
545
546	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
547		sk_dst_reset(sk);
548		dst_release(dst);
549		return NULL;
550	}
551
552	return dst;
553}
554EXPORT_SYMBOL(sk_dst_check);
555
556static int sock_setbindtodevice(struct sock *sk, char __user *optval,
557				int optlen)
558{
559	int ret = -ENOPROTOOPT;
560#ifdef CONFIG_NETDEVICES
561	struct net *net = sock_net(sk);
562	char devname[IFNAMSIZ];
563	int index;
564
565	/* Sorry... */
566	ret = -EPERM;
567	if (!ns_capable(net->user_ns, CAP_NET_RAW))
568		goto out;
569
570	ret = -EINVAL;
571	if (optlen < 0)
572		goto out;
573
574	/* Bind this socket to a particular device like "eth0",
575	 * as specified in the passed interface name. If the
576	 * name is "" or the option length is zero the socket
577	 * is not bound.
578	 */
579	if (optlen > IFNAMSIZ - 1)
580		optlen = IFNAMSIZ - 1;
581	memset(devname, 0, sizeof(devname));
582
583	ret = -EFAULT;
584	if (copy_from_user(devname, optval, optlen))
585		goto out;
586
587	index = 0;
588	if (devname[0] != '\0') {
589		struct net_device *dev;
590
591		rcu_read_lock();
592		dev = dev_get_by_name_rcu(net, devname);
593		if (dev)
594			index = dev->ifindex;
595		rcu_read_unlock();
596		ret = -ENODEV;
597		if (!dev)
598			goto out;
599	}
600
601	lock_sock(sk);
602	sk->sk_bound_dev_if = index;
603	sk_dst_reset(sk);
604	release_sock(sk);
605
606	ret = 0;
607
608out:
609#endif
610
611	return ret;
612}
613
614static int sock_getbindtodevice(struct sock *sk, char __user *optval,
615				int __user *optlen, int len)
616{
617	int ret = -ENOPROTOOPT;
618#ifdef CONFIG_NETDEVICES
619	struct net *net = sock_net(sk);
620	char devname[IFNAMSIZ];
621
622	if (sk->sk_bound_dev_if == 0) {
623		len = 0;
624		goto zero;
625	}
626
627	ret = -EINVAL;
628	if (len < IFNAMSIZ)
629		goto out;
630
631	ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
632	if (ret)
633		goto out;
634
635	len = strlen(devname) + 1;
636
637	ret = -EFAULT;
638	if (copy_to_user(optval, devname, len))
639		goto out;
640
641zero:
642	ret = -EFAULT;
643	if (put_user(len, optlen))
644		goto out;
645
646	ret = 0;
647
648out:
649#endif
650
651	return ret;
652}
653
654static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
655{
656	if (valbool)
657		sock_set_flag(sk, bit);
658	else
659		sock_reset_flag(sk, bit);
660}
661
662/*
663 *	This is meant for all protocols to use and covers goings on
664 *	at the socket level. Everything here is generic.
665 */
666
667int sock_setsockopt(struct socket *sock, int level, int optname,
668		    char __user *optval, unsigned int optlen)
669{
670	struct sock *sk = sock->sk;
671	int val;
672	int valbool;
673	struct linger ling;
674	int ret = 0;
675
676	/*
677	 *	Options without arguments
678	 */
679
680	if (optname == SO_BINDTODEVICE)
681		return sock_setbindtodevice(sk, optval, optlen);
682
683	if (optlen < sizeof(int))
684		return -EINVAL;
685
686	if (get_user(val, (int __user *)optval))
687		return -EFAULT;
688
689	valbool = val ? 1 : 0;
690
691	lock_sock(sk);
692
693	switch (optname) {
694	case SO_DEBUG:
695		if (val && !capable(CAP_NET_ADMIN))
696			ret = -EACCES;
697		else
698			sock_valbool_flag(sk, SOCK_DBG, valbool);
699		break;
700	case SO_REUSEADDR:
701		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
702		break;
703	case SO_REUSEPORT:
704		sk->sk_reuseport = valbool;
705		break;
706	case SO_TYPE:
707	case SO_PROTOCOL:
708	case SO_DOMAIN:
709	case SO_ERROR:
710		ret = -ENOPROTOOPT;
711		break;
712	case SO_DONTROUTE:
713		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
714		break;
715	case SO_BROADCAST:
716		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
717		break;
718	case SO_SNDBUF:
719		/* Don't error on this; BSD doesn't, and if you think
720		 * about it this is right. Otherwise apps have to
721		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
722		 * are treated in BSD as hints.
723		 */
724		val = min_t(u32, val, sysctl_wmem_max);
725set_sndbuf:
726		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
727		sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF);
728		/* Wake up sending tasks if we upped the value. */
729		sk->sk_write_space(sk);
730		break;
731
732	case SO_SNDBUFFORCE:
733		if (!capable(CAP_NET_ADMIN)) {
734			ret = -EPERM;
735			break;
736		}
737		goto set_sndbuf;
738
739	case SO_RCVBUF:
740		/* Don't error on this; BSD doesn't, and if you think
741		 * about it this is right. Otherwise apps have to
742		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
743		 * are treated in BSD as hints.
744		 */
745		val = min_t(u32, val, sysctl_rmem_max);
746set_rcvbuf:
747		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
748		/*
749		 * We double it on the way in to account for
750		 * "struct sk_buff" etc. overhead.   Applications
751		 * assume that the SO_RCVBUF setting they make will
752		 * allow that much actual data to be received on that
753		 * socket.
754		 *
755		 * Applications are unaware that "struct sk_buff" and
756		 * other overheads allocate from the receive buffer
757		 * during socket buffer allocation.
758		 *
759		 * And after considering the possible alternatives,
760		 * returning the value we actually used in getsockopt
761		 * is the most desirable behavior.
762		 */
763		sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF);
764		break;
765
766	case SO_RCVBUFFORCE:
767		if (!capable(CAP_NET_ADMIN)) {
768			ret = -EPERM;
769			break;
770		}
771		goto set_rcvbuf;
772
773	case SO_KEEPALIVE:
774#ifdef CONFIG_INET
775		if (sk->sk_protocol == IPPROTO_TCP &&
776		    sk->sk_type == SOCK_STREAM)
777			tcp_set_keepalive(sk, valbool);
778#endif
779		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
780		break;
781
782	case SO_OOBINLINE:
783		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
784		break;
785
786	case SO_NO_CHECK:
787		sk->sk_no_check_tx = valbool;
788		break;
789
790	case SO_PRIORITY:
791		if ((val >= 0 && val <= 6) ||
792		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
793			sk->sk_priority = val;
794		else
795			ret = -EPERM;
796		break;
797
798	case SO_LINGER:
799		if (optlen < sizeof(ling)) {
800			ret = -EINVAL;	/* 1003.1g */
801			break;
802		}
803		if (copy_from_user(&ling, optval, sizeof(ling))) {
804			ret = -EFAULT;
805			break;
806		}
807		if (!ling.l_onoff)
808			sock_reset_flag(sk, SOCK_LINGER);
809		else {
810#if (BITS_PER_LONG == 32)
811			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
812				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
813			else
814#endif
815				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
816			sock_set_flag(sk, SOCK_LINGER);
817		}
818		break;
819
820	case SO_BSDCOMPAT:
821		sock_warn_obsolete_bsdism("setsockopt");
822		break;
823
824	case SO_PASSCRED:
825		if (valbool)
826			set_bit(SOCK_PASSCRED, &sock->flags);
827		else
828			clear_bit(SOCK_PASSCRED, &sock->flags);
829		break;
830
831	case SO_TIMESTAMP:
832	case SO_TIMESTAMPNS:
833		if (valbool)  {
834			if (optname == SO_TIMESTAMP)
835				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
836			else
837				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
838			sock_set_flag(sk, SOCK_RCVTSTAMP);
839			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
840		} else {
841			sock_reset_flag(sk, SOCK_RCVTSTAMP);
842			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
843		}
844		break;
845
846	case SO_TIMESTAMPING:
847		if (val & ~SOF_TIMESTAMPING_MASK) {
848			ret = -EINVAL;
849			break;
850		}
851		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
852				  val & SOF_TIMESTAMPING_TX_HARDWARE);
853		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
854				  val & SOF_TIMESTAMPING_TX_SOFTWARE);
855		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
856				  val & SOF_TIMESTAMPING_RX_HARDWARE);
857		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
858			sock_enable_timestamp(sk,
859					      SOCK_TIMESTAMPING_RX_SOFTWARE);
860		else
861			sock_disable_timestamp(sk,
862					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
863		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
864				  val & SOF_TIMESTAMPING_SOFTWARE);
865		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
866				  val & SOF_TIMESTAMPING_RAW_HARDWARE);
867		break;
868
869	case SO_RCVLOWAT:
870		if (val < 0)
871			val = INT_MAX;
872		sk->sk_rcvlowat = val ? : 1;
873		break;
874
875	case SO_RCVTIMEO:
876		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
877		break;
878
879	case SO_SNDTIMEO:
880		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
881		break;
882
883	case SO_ATTACH_FILTER:
884		ret = -EINVAL;
885		if (optlen == sizeof(struct sock_fprog)) {
886			struct sock_fprog fprog;
887
888			ret = -EFAULT;
889			if (copy_from_user(&fprog, optval, sizeof(fprog)))
890				break;
891
892			ret = sk_attach_filter(&fprog, sk);
893		}
894		break;
895
896	case SO_DETACH_FILTER:
897		ret = sk_detach_filter(sk);
898		break;
899
900	case SO_LOCK_FILTER:
901		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
902			ret = -EPERM;
903		else
904			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
905		break;
906
907	case SO_PASSSEC:
908		if (valbool)
909			set_bit(SOCK_PASSSEC, &sock->flags);
910		else
911			clear_bit(SOCK_PASSSEC, &sock->flags);
912		break;
913	case SO_MARK:
914		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
915			ret = -EPERM;
916		else
917			sk->sk_mark = val;
918		break;
919
920		/* We implement the SO_SNDLOWAT etc to
921		   not be settable (1003.1g 5.3) */
922	case SO_RXQ_OVFL:
923		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
924		break;
925
926	case SO_WIFI_STATUS:
927		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
928		break;
929
930	case SO_PEEK_OFF:
931		if (sock->ops->set_peek_off)
932			ret = sock->ops->set_peek_off(sk, val);
933		else
934			ret = -EOPNOTSUPP;
935		break;
936
937	case SO_NOFCS:
938		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
939		break;
940
941	case SO_SELECT_ERR_QUEUE:
942		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
943		break;
944
945#ifdef CONFIG_NET_RX_BUSY_POLL
946	case SO_BUSY_POLL:
947		/* allow unprivileged users to decrease the value */
948		if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
949			ret = -EPERM;
950		else {
951			if (val < 0)
952				ret = -EINVAL;
953			else
954				sk->sk_ll_usec = val;
955		}
956		break;
957#endif
958
959	case SO_MAX_PACING_RATE:
960		sk->sk_max_pacing_rate = val;
961		sk->sk_pacing_rate = min(sk->sk_pacing_rate,
962					 sk->sk_max_pacing_rate);
963		break;
964
965	default:
966		ret = -ENOPROTOOPT;
967		break;
968	}
969	release_sock(sk);
970	return ret;
971}
972EXPORT_SYMBOL(sock_setsockopt);
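/* Illustrative only: SOL_SOCKET options reach sock_setsockopt() straight from
 * the setsockopt() system call, e.g. from userspace (error handling omitted):
 *
 *	int one = 1;
 *	setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));
 *
 * Protocol-level options go to the protocol's own setsockopt handler instead.
 */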
973
974
975static void cred_to_ucred(struct pid *pid, const struct cred *cred,
976			  struct ucred *ucred)
977{
978	ucred->pid = pid_vnr(pid);
979	ucred->uid = ucred->gid = -1;
980	if (cred) {
981		struct user_namespace *current_ns = current_user_ns();
982
983		ucred->uid = from_kuid_munged(current_ns, cred->euid);
984		ucred->gid = from_kgid_munged(current_ns, cred->egid);
985	}
986}
987
988int sock_getsockopt(struct socket *sock, int level, int optname,
989		    char __user *optval, int __user *optlen)
990{
991	struct sock *sk = sock->sk;
992
993	union {
994		int val;
995		struct linger ling;
996		struct timeval tm;
997	} v;
998
999	int lv = sizeof(int);
1000	int len;
1001
1002	if (get_user(len, optlen))
1003		return -EFAULT;
1004	if (len < 0)
1005		return -EINVAL;
1006
1007	memset(&v, 0, sizeof(v));
1008
1009	switch (optname) {
1010	case SO_DEBUG:
1011		v.val = sock_flag(sk, SOCK_DBG);
1012		break;
1013
1014	case SO_DONTROUTE:
1015		v.val = sock_flag(sk, SOCK_LOCALROUTE);
1016		break;
1017
1018	case SO_BROADCAST:
1019		v.val = sock_flag(sk, SOCK_BROADCAST);
1020		break;
1021
1022	case SO_SNDBUF:
1023		v.val = sk->sk_sndbuf;
1024		break;
1025
1026	case SO_RCVBUF:
1027		v.val = sk->sk_rcvbuf;
1028		break;
1029
1030	case SO_REUSEADDR:
1031		v.val = sk->sk_reuse;
1032		break;
1033
1034	case SO_REUSEPORT:
1035		v.val = sk->sk_reuseport;
1036		break;
1037
1038	case SO_KEEPALIVE:
1039		v.val = sock_flag(sk, SOCK_KEEPOPEN);
1040		break;
1041
1042	case SO_TYPE:
1043		v.val = sk->sk_type;
1044		break;
1045
1046	case SO_PROTOCOL:
1047		v.val = sk->sk_protocol;
1048		break;
1049
1050	case SO_DOMAIN:
1051		v.val = sk->sk_family;
1052		break;
1053
1054	case SO_ERROR:
1055		v.val = -sock_error(sk);
1056		if (v.val == 0)
1057			v.val = xchg(&sk->sk_err_soft, 0);
1058		break;
1059
1060	case SO_OOBINLINE:
1061		v.val = sock_flag(sk, SOCK_URGINLINE);
1062		break;
1063
1064	case SO_NO_CHECK:
1065		v.val = sk->sk_no_check_tx;
1066		break;
1067
1068	case SO_PRIORITY:
1069		v.val = sk->sk_priority;
1070		break;
1071
1072	case SO_LINGER:
1073		lv		= sizeof(v.ling);
1074		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
1075		v.ling.l_linger	= sk->sk_lingertime / HZ;
1076		break;
1077
1078	case SO_BSDCOMPAT:
1079		sock_warn_obsolete_bsdism("getsockopt");
1080		break;
1081
1082	case SO_TIMESTAMP:
1083		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1084				!sock_flag(sk, SOCK_RCVTSTAMPNS);
1085		break;
1086
1087	case SO_TIMESTAMPNS:
1088		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1089		break;
1090
1091	case SO_TIMESTAMPING:
1092		v.val = 0;
1093		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
1094			v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
1095		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
1096			v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
1097		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
1098			v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
1099		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
1100			v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
1101		if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
1102			v.val |= SOF_TIMESTAMPING_SOFTWARE;
1103		if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
1104			v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
1105		break;
1106
1107	case SO_RCVTIMEO:
1108		lv = sizeof(struct timeval);
1109		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1110			v.tm.tv_sec = 0;
1111			v.tm.tv_usec = 0;
1112		} else {
1113			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1114			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
1115		}
1116		break;
1117
1118	case SO_SNDTIMEO:
1119		lv = sizeof(struct timeval);
1120		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1121			v.tm.tv_sec = 0;
1122			v.tm.tv_usec = 0;
1123		} else {
1124			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1125			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
1126		}
1127		break;
1128
1129	case SO_RCVLOWAT:
1130		v.val = sk->sk_rcvlowat;
1131		break;
1132
1133	case SO_SNDLOWAT:
1134		v.val = 1;
1135		break;
1136
1137	case SO_PASSCRED:
1138		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1139		break;
1140
1141	case SO_PEERCRED:
1142	{
1143		struct ucred peercred;
1144		if (len > sizeof(peercred))
1145			len = sizeof(peercred);
1146		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1147		if (copy_to_user(optval, &peercred, len))
1148			return -EFAULT;
1149		goto lenout;
1150	}
1151
1152	case SO_PEERNAME:
1153	{
1154		char address[128];
1155
1156		if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
1157			return -ENOTCONN;
1158		if (lv < len)
1159			return -EINVAL;
1160		if (copy_to_user(optval, address, len))
1161			return -EFAULT;
1162		goto lenout;
1163	}
1164
1165	/* Dubious BSD thing... Probably nobody even uses it, but
1166	 * the UNIX standard wants it for whatever reason... -DaveM
1167	 */
1168	case SO_ACCEPTCONN:
1169		v.val = sk->sk_state == TCP_LISTEN;
1170		break;
1171
1172	case SO_PASSSEC:
1173		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1174		break;
1175
1176	case SO_PEERSEC:
1177		return security_socket_getpeersec_stream(sock, optval, optlen, len);
1178
1179	case SO_MARK:
1180		v.val = sk->sk_mark;
1181		break;
1182
1183	case SO_RXQ_OVFL:
1184		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1185		break;
1186
1187	case SO_WIFI_STATUS:
1188		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1189		break;
1190
1191	case SO_PEEK_OFF:
1192		if (!sock->ops->set_peek_off)
1193			return -EOPNOTSUPP;
1194
1195		v.val = sk->sk_peek_off;
1196		break;
1197	case SO_NOFCS:
1198		v.val = sock_flag(sk, SOCK_NOFCS);
1199		break;
1200
1201	case SO_BINDTODEVICE:
1202		return sock_getbindtodevice(sk, optval, optlen, len);
1203
1204	case SO_GET_FILTER:
1205		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1206		if (len < 0)
1207			return len;
1208
1209		goto lenout;
1210
1211	case SO_LOCK_FILTER:
1212		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1213		break;
1214
1215	case SO_BPF_EXTENSIONS:
1216		v.val = bpf_tell_extensions();
1217		break;
1218
1219	case SO_SELECT_ERR_QUEUE:
1220		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1221		break;
1222
1223#ifdef CONFIG_NET_RX_BUSY_POLL
1224	case SO_BUSY_POLL:
1225		v.val = sk->sk_ll_usec;
1226		break;
1227#endif
1228
1229	case SO_MAX_PACING_RATE:
1230		v.val = sk->sk_max_pacing_rate;
1231		break;
1232
1233	default:
1234		return -ENOPROTOOPT;
1235	}
1236
1237	if (len > lv)
1238		len = lv;
1239	if (copy_to_user(optval, &v, len))
1240		return -EFAULT;
1241lenout:
1242	if (put_user(len, optlen))
1243		return -EFAULT;
1244	return 0;
1245}
1246
1247/*
1248 * Initialize an sk_lock.
1249 *
1250 * (We also register the sk_lock with the lock validator.)
1251 */
1252static inline void sock_lock_init(struct sock *sk)
1253{
1254	sock_lock_init_class_and_name(sk,
1255			af_family_slock_key_strings[sk->sk_family],
1256			af_family_slock_keys + sk->sk_family,
1257			af_family_key_strings[sk->sk_family],
1258			af_family_keys + sk->sk_family);
1259}
1260
1261/*
1262 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1263 * even temporarily, because of RCU lookups. sk_node should also be left as is.
1264 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1265 */
1266static void sock_copy(struct sock *nsk, const struct sock *osk)
1267{
1268#ifdef CONFIG_SECURITY_NETWORK
1269	void *sptr = nsk->sk_security;
1270#endif
1271	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1272
1273	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1274	       osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1275
1276#ifdef CONFIG_SECURITY_NETWORK
1277	nsk->sk_security = sptr;
1278	security_sk_clone(osk, nsk);
1279#endif
1280}
1281
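/* sk_prot_clear_portaddr_nulls() zeroes a socket that may live on a
 * SLAB_DESTROY_BY_RCU cache while leaving the two hash-list 'next' pointers
 * untouched: concurrent RCU/nulls lookups may still be traversing those
 * lists, so only the bytes around them are cleared.
 */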
1282void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
1283{
1284	unsigned long nulls1, nulls2;
1285
1286	nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
1287	nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
1288	if (nulls1 > nulls2)
1289		swap(nulls1, nulls2);
1290
1291	if (nulls1 != 0)
1292		memset((char *)sk, 0, nulls1);
1293	memset((char *)sk + nulls1 + sizeof(void *), 0,
1294	       nulls2 - nulls1 - sizeof(void *));
1295	memset((char *)sk + nulls2 + sizeof(void *), 0,
1296	       size - nulls2 - sizeof(void *));
1297}
1298EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
1299
1300static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1301		int family)
1302{
1303	struct sock *sk;
1304	struct kmem_cache *slab;
1305
1306	slab = prot->slab;
1307	if (slab != NULL) {
1308		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1309		if (!sk)
1310			return sk;
1311		if (priority & __GFP_ZERO) {
1312			if (prot->clear_sk)
1313				prot->clear_sk(sk, prot->obj_size);
1314			else
1315				sk_prot_clear_nulls(sk, prot->obj_size);
1316		}
1317	} else
1318		sk = kmalloc(prot->obj_size, priority);
1319
1320	if (sk != NULL) {
1321		kmemcheck_annotate_bitfield(sk, flags);
1322
1323		if (security_sk_alloc(sk, family, priority))
1324			goto out_free;
1325
1326		if (!try_module_get(prot->owner))
1327			goto out_free_sec;
1328		sk_tx_queue_clear(sk);
1329	}
1330
1331	return sk;
1332
1333out_free_sec:
1334	security_sk_free(sk);
1335out_free:
1336	if (slab != NULL)
1337		kmem_cache_free(slab, sk);
1338	else
1339		kfree(sk);
1340	return NULL;
1341}
1342
1343static void sk_prot_free(struct proto *prot, struct sock *sk)
1344{
1345	struct kmem_cache *slab;
1346	struct module *owner;
1347
1348	owner = prot->owner;
1349	slab = prot->slab;
1350
1351	security_sk_free(sk);
1352	if (slab != NULL)
1353		kmem_cache_free(slab, sk);
1354	else
1355		kfree(sk);
1356	module_put(owner);
1357}
1358
1359#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
1360void sock_update_netprioidx(struct sock *sk)
1361{
1362	if (in_interrupt())
1363		return;
1364
1365	sk->sk_cgrp_prioidx = task_netprioidx(current);
1366}
1367EXPORT_SYMBOL_GPL(sock_update_netprioidx);
1368#endif
1369
1370/**
1371 *	sk_alloc - All socket objects are allocated here
1372 *	@net: the applicable net namespace
1373 *	@family: protocol family
1374 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1375 *	@prot: struct proto associated with this new sock instance
1376 */
1377struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1378		      struct proto *prot)
1379{
1380	struct sock *sk;
1381
1382	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1383	if (sk) {
1384		sk->sk_family = family;
1385		/*
1386		 * See comment in struct sock definition to understand
1387		 * why we need sk_prot_creator -acme
1388		 */
1389		sk->sk_prot = sk->sk_prot_creator = prot;
1390		sock_lock_init(sk);
1391		sock_net_set(sk, get_net(net));
1392		atomic_set(&sk->sk_wmem_alloc, 1);
1393
1394		sock_update_classid(sk);
1395		sock_update_netprioidx(sk);
1396	}
1397
1398	return sk;
1399}
1400EXPORT_SYMBOL(sk_alloc);
1401
1402static void __sk_free(struct sock *sk)
1403{
1404	struct sk_filter *filter;
1405
1406	if (sk->sk_destruct)
1407		sk->sk_destruct(sk);
1408
1409	filter = rcu_dereference_check(sk->sk_filter,
1410				       atomic_read(&sk->sk_wmem_alloc) == 0);
1411	if (filter) {
1412		sk_filter_uncharge(sk, filter);
1413		RCU_INIT_POINTER(sk->sk_filter, NULL);
1414	}
1415
1416	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1417
1418	if (atomic_read(&sk->sk_omem_alloc))
1419		pr_debug("%s: optmem leakage (%d bytes) detected\n",
1420			 __func__, atomic_read(&sk->sk_omem_alloc));
1421
1422	if (sk->sk_peer_cred)
1423		put_cred(sk->sk_peer_cred);
1424	put_pid(sk->sk_peer_pid);
1425	put_net(sock_net(sk));
1426	sk_prot_free(sk->sk_prot_creator, sk);
1427}
1428
1429void sk_free(struct sock *sk)
1430{
1431	/*
1432	 * We subtract one from sk_wmem_alloc so we can tell whether
1433	 * some packets are still in some tx queue.
1434	 * If it is not zero, sock_wfree() will call __sk_free(sk) later.
1435	 */
1436	if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1437		__sk_free(sk);
1438}
1439EXPORT_SYMBOL(sk_free);
1440
1441/*
1442 * The last sock_put should drop a reference to sk->sk_net. It has already
1443 * been dropped in sk_change_net. Taking a reference to the stopping namespace
1444 * is not an option.
1445 * Take a reference to the socket to remove it from the hash while it is still
1446 * _alive_, and after that destroy it in the context of init_net.
1447 */
1448void sk_release_kernel(struct sock *sk)
1449{
1450	if (sk == NULL || sk->sk_socket == NULL)
1451		return;
1452
1453	sock_hold(sk);
1454	sock_release(sk->sk_socket);
1455	release_net(sock_net(sk));
1456	sock_net_set(sk, get_net(&init_net));
1457	sock_put(sk);
1458}
1459EXPORT_SYMBOL(sk_release_kernel);
1460
1461static void sk_update_clone(const struct sock *sk, struct sock *newsk)
1462{
1463	if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
1464		sock_update_memcg(newsk);
1465}
1466
1467/**
1468 *	sk_clone_lock - clone a socket, and lock its clone
1469 *	@sk: the socket to clone
1470 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1471 *
1472 *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1473 */
1474struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1475{
1476	struct sock *newsk;
1477
1478	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1479	if (newsk != NULL) {
1480		struct sk_filter *filter;
1481
1482		sock_copy(newsk, sk);
1483
1484		/* SANITY */
1485		get_net(sock_net(newsk));
1486		sk_node_init(&newsk->sk_node);
1487		sock_lock_init(newsk);
1488		bh_lock_sock(newsk);
1489		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
1490		newsk->sk_backlog.len = 0;
1491
1492		atomic_set(&newsk->sk_rmem_alloc, 0);
1493		/*
1494		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1495		 */
1496		atomic_set(&newsk->sk_wmem_alloc, 1);
1497		atomic_set(&newsk->sk_omem_alloc, 0);
1498		skb_queue_head_init(&newsk->sk_receive_queue);
1499		skb_queue_head_init(&newsk->sk_write_queue);
1500#ifdef CONFIG_NET_DMA
1501		skb_queue_head_init(&newsk->sk_async_wait_queue);
1502#endif
1503
1504		spin_lock_init(&newsk->sk_dst_lock);
1505		rwlock_init(&newsk->sk_callback_lock);
1506		lockdep_set_class_and_name(&newsk->sk_callback_lock,
1507				af_callback_keys + newsk->sk_family,
1508				af_family_clock_key_strings[newsk->sk_family]);
1509
1510		newsk->sk_dst_cache	= NULL;
1511		newsk->sk_wmem_queued	= 0;
1512		newsk->sk_forward_alloc = 0;
1513		newsk->sk_send_head	= NULL;
1514		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1515
1516		sock_reset_flag(newsk, SOCK_DONE);
1517		skb_queue_head_init(&newsk->sk_error_queue);
1518
1519		filter = rcu_dereference_protected(newsk->sk_filter, 1);
1520		if (filter != NULL)
1521			sk_filter_charge(newsk, filter);
1522
1523		if (unlikely(xfrm_sk_clone_policy(newsk))) {
1524			/* It is still raw copy of parent, so invalidate
1525			 * destructor and make plain sk_free() */
1526			newsk->sk_destruct = NULL;
1527			bh_unlock_sock(newsk);
1528			sk_free(newsk);
1529			newsk = NULL;
1530			goto out;
1531		}
1532
1533		newsk->sk_err	   = 0;
1534		newsk->sk_priority = 0;
1535		/*
1536		 * Before updating sk_refcnt, we must commit prior changes to memory
1537		 * (Documentation/RCU/rculist_nulls.txt for details)
1538		 */
1539		smp_wmb();
1540		atomic_set(&newsk->sk_refcnt, 2);
1541
1542		/*
1543		 * Increment the counter in the same struct proto as the master
1544		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1545		 * is the same as sk->sk_prot->socks, as this field was copied
1546		 * with memcpy).
1547		 *
1548		 * This _changes_ the previous behaviour, where
1549		 * tcp_create_openreq_child always was incrementing the
1550		 * equivalent to tcp_prot->socks (inet_sock_nr), so this has
1551		 * to be taken into account in all callers. -acme
1552		 */
1553		sk_refcnt_debug_inc(newsk);
1554		sk_set_socket(newsk, NULL);
1555		newsk->sk_wq = NULL;
1556
1557		sk_update_clone(sk, newsk);
1558
1559		if (newsk->sk_prot->sockets_allocated)
1560			sk_sockets_allocated_inc(newsk);
1561
1562		if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1563			net_enable_timestamp();
1564	}
1565out:
1566	return newsk;
1567}
1568EXPORT_SYMBOL_GPL(sk_clone_lock);
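/* Note: the clone is returned bh-locked with sk_refcnt set to 2; typically
 * one reference is for the caller and the other covers the hash table
 * insertion the protocol performs next.
 */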
1569
1570void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1571{
1572	__sk_dst_set(sk, dst);
1573	sk->sk_route_caps = dst->dev->features;
1574	if (sk->sk_route_caps & NETIF_F_GSO)
1575		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1576	sk->sk_route_caps &= ~sk->sk_route_nocaps;
1577	if (sk_can_gso(sk)) {
1578		if (dst->header_len) {
1579			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1580		} else {
1581			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1582			sk->sk_gso_max_size = dst->dev->gso_max_size;
1583			sk->sk_gso_max_segs = dst->dev->gso_max_segs;
1584		}
1585	}
1586}
1587EXPORT_SYMBOL_GPL(sk_setup_caps);
1588
1589/*
1590 *	Simple resource managers for sockets.
1591 */
1592
1593
1594/*
1595 * Write buffer destructor automatically called from kfree_skb.
1596 */
1597void sock_wfree(struct sk_buff *skb)
1598{
1599	struct sock *sk = skb->sk;
1600	unsigned int len = skb->truesize;
1601
1602	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1603		/*
1604		 * Keep a reference on sk_wmem_alloc; it will be released
1605		 * after the sk_write_space() call.
1606		 */
1607		atomic_sub(len - 1, &sk->sk_wmem_alloc);
1608		sk->sk_write_space(sk);
1609		len = 1;
1610	}
1611	/*
1612	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1613	 * could not do because of in-flight packets
1614	 */
1615	if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1616		__sk_free(sk);
1617}
1618EXPORT_SYMBOL(sock_wfree);
1619
1620void skb_orphan_partial(struct sk_buff *skb)
1621{
1622	/* TCP stack sets skb->ooo_okay based on sk_wmem_alloc,
1623	 * so we do not completely orphan the skb, but transfer all
1624	 * accounted bytes but one, to avoid unexpected reorders.
1625	 */
1626	if (skb->destructor == sock_wfree
1627#ifdef CONFIG_INET
1628	    || skb->destructor == tcp_wfree
1629#endif
1630		) {
1631		atomic_sub(skb->truesize - 1, &skb->sk->sk_wmem_alloc);
1632		skb->truesize = 1;
1633	} else {
1634		skb_orphan(skb);
1635	}
1636}
1637EXPORT_SYMBOL(skb_orphan_partial);
1638
1639/*
1640 * Read buffer destructor automatically called from kfree_skb.
1641 */
1642void sock_rfree(struct sk_buff *skb)
1643{
1644	struct sock *sk = skb->sk;
1645	unsigned int len = skb->truesize;
1646
1647	atomic_sub(len, &sk->sk_rmem_alloc);
1648	sk_mem_uncharge(sk, len);
1649}
1650EXPORT_SYMBOL(sock_rfree);
1651
1652void sock_edemux(struct sk_buff *skb)
1653{
1654	struct sock *sk = skb->sk;
1655
1656#ifdef CONFIG_INET
1657	if (sk->sk_state == TCP_TIME_WAIT)
1658		inet_twsk_put(inet_twsk(sk));
1659	else
1660#endif
1661		sock_put(sk);
1662}
1663EXPORT_SYMBOL(sock_edemux);
1664
1665kuid_t sock_i_uid(struct sock *sk)
1666{
1667	kuid_t uid;
1668
1669	read_lock_bh(&sk->sk_callback_lock);
1670	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1671	read_unlock_bh(&sk->sk_callback_lock);
1672	return uid;
1673}
1674EXPORT_SYMBOL(sock_i_uid);
1675
1676unsigned long sock_i_ino(struct sock *sk)
1677{
1678	unsigned long ino;
1679
1680	read_lock_bh(&sk->sk_callback_lock);
1681	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1682	read_unlock_bh(&sk->sk_callback_lock);
1683	return ino;
1684}
1685EXPORT_SYMBOL(sock_i_ino);
1686
1687/*
1688 * Allocate a skb from the socket's send buffer.
1689 */
1690struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1691			     gfp_t priority)
1692{
1693	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1694		struct sk_buff *skb = alloc_skb(size, priority);
1695		if (skb) {
1696			skb_set_owner_w(skb, sk);
1697			return skb;
1698		}
1699	}
1700	return NULL;
1701}
1702EXPORT_SYMBOL(sock_wmalloc);
1703
1704/*
1705 * Allocate a memory block from the socket's option memory buffer.
1706 */
1707void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1708{
1709	if ((unsigned int)size <= sysctl_optmem_max &&
1710	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1711		void *mem;
1712		/* First do the add, to avoid the race if kmalloc
1713		 * might sleep.
1714		 */
1715		atomic_add(size, &sk->sk_omem_alloc);
1716		mem = kmalloc(size, priority);
1717		if (mem)
1718			return mem;
1719		atomic_sub(size, &sk->sk_omem_alloc);
1720	}
1721	return NULL;
1722}
1723EXPORT_SYMBOL(sock_kmalloc);
1724
1725/*
1726 * Free an option memory block.
1727 */
1728void sock_kfree_s(struct sock *sk, void *mem, int size)
1729{
1730	kfree(mem);
1731	atomic_sub(size, &sk->sk_omem_alloc);
1732}
1733EXPORT_SYMBOL(sock_kfree_s);
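/* Note: callers must pass the same size to sock_kfree_s() that they passed
 * to sock_kmalloc(); the option-memory accounting relies entirely on the
 * caller-supplied length.
 */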
1734
1735/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1736   I think these locks should be removed for datagram sockets.
1737 */
1738static long sock_wait_for_wmem(struct sock *sk, long timeo)
1739{
1740	DEFINE_WAIT(wait);
1741
1742	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1743	for (;;) {
1744		if (!timeo)
1745			break;
1746		if (signal_pending(current))
1747			break;
1748		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1749		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1750		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1751			break;
1752		if (sk->sk_shutdown & SEND_SHUTDOWN)
1753			break;
1754		if (sk->sk_err)
1755			break;
1756		timeo = schedule_timeout(timeo);
1757	}
1758	finish_wait(sk_sleep(sk), &wait);
1759	return timeo;
1760}
1761
1762
1763/*
1764 *	Generic send/receive buffer handlers
1765 */
1766
1767struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1768				     unsigned long data_len, int noblock,
1769				     int *errcode, int max_page_order)
1770{
1771	struct sk_buff *skb = NULL;
1772	unsigned long chunk;
1773	gfp_t gfp_mask;
1774	long timeo;
1775	int err;
1776	int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1777	struct page *page;
1778	int i;
1779
1780	err = -EMSGSIZE;
1781	if (npages > MAX_SKB_FRAGS)
1782		goto failure;
1783
1784	timeo = sock_sndtimeo(sk, noblock);
1785	while (!skb) {
1786		err = sock_error(sk);
1787		if (err != 0)
1788			goto failure;
1789
1790		err = -EPIPE;
1791		if (sk->sk_shutdown & SEND_SHUTDOWN)
1792			goto failure;
1793
1794		if (atomic_read(&sk->sk_wmem_alloc) >= sk->sk_sndbuf) {
1795			set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1796			set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1797			err = -EAGAIN;
1798			if (!timeo)
1799				goto failure;
1800			if (signal_pending(current))
1801				goto interrupted;
1802			timeo = sock_wait_for_wmem(sk, timeo);
1803			continue;
1804		}
1805
1806		err = -ENOBUFS;
1807		gfp_mask = sk->sk_allocation;
1808		if (gfp_mask & __GFP_WAIT)
1809			gfp_mask |= __GFP_REPEAT;
1810
1811		skb = alloc_skb(header_len, gfp_mask);
1812		if (!skb)
1813			goto failure;
1814
1815		skb->truesize += data_len;
1816
1817		for (i = 0; npages > 0; i++) {
1818			int order = max_page_order;
1819
1820			while (order) {
1821				if (npages >= 1 << order) {
1822					page = alloc_pages(sk->sk_allocation |
1823							   __GFP_COMP |
1824							   __GFP_NOWARN |
1825							   __GFP_NORETRY,
1826							   order);
1827					if (page)
1828						goto fill_page;
1829				}
1830				order--;
1831			}
1832			page = alloc_page(sk->sk_allocation);
1833			if (!page)
1834				goto failure;
1835fill_page:
1836			chunk = min_t(unsigned long, data_len,
1837				      PAGE_SIZE << order);
1838			skb_fill_page_desc(skb, i, page, 0, chunk);
1839			data_len -= chunk;
1840			npages -= 1 << order;
1841		}
1842	}
1843
1844	skb_set_owner_w(skb, sk);
1845	return skb;
1846
1847interrupted:
1848	err = sock_intr_errno(timeo);
1849failure:
1850	kfree_skb(skb);
1851	*errcode = err;
1852	return NULL;
1853}
1854EXPORT_SYMBOL(sock_alloc_send_pskb);
1855
1856struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1857				    int noblock, int *errcode)
1858{
1859	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
1860}
1861EXPORT_SYMBOL(sock_alloc_send_skb);
1862
1863/* On 32bit arches, an skb frag is limited to 2^15 */
1864#define SKB_FRAG_PAGE_ORDER	get_order(32768)
1865
1866/**
1867 * skb_page_frag_refill - check that a page_frag contains enough room
1868 * @sz: minimum size of the fragment we want to get
1869 * @pfrag: pointer to page_frag
1870 * @prio: priority for memory allocation
1871 *
1872 * Note: While this allocator tries to use high order pages, there is
1873 * no guarantee that allocations succeed. Therefore, @sz MUST be
1874 * less than or equal to PAGE_SIZE.
1875 */
1876bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t prio)
1877{
1878	int order;
1879
1880	if (pfrag->page) {
1881		if (atomic_read(&pfrag->page->_count) == 1) {
1882			pfrag->offset = 0;
1883			return true;
1884		}
1885		if (pfrag->offset + sz <= pfrag->size)
1886			return true;
1887		put_page(pfrag->page);
1888	}
1889
1890	order = SKB_FRAG_PAGE_ORDER;
1891	do {
1892		gfp_t gfp = prio;
1893
1894		if (order)
1895			gfp |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY;
1896		pfrag->page = alloc_pages(gfp, order);
1897		if (likely(pfrag->page)) {
1898			pfrag->offset = 0;
1899			pfrag->size = PAGE_SIZE << order;
1900			return true;
1901		}
1902	} while (--order >= 0);
1903
1904	return false;
1905}
1906EXPORT_SYMBOL(skb_page_frag_refill);
1907
1908bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
1909{
1910	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
1911		return true;
1912
1913	sk_enter_memory_pressure(sk);
1914	sk_stream_moderate_sndbuf(sk);
1915	return false;
1916}
1917EXPORT_SYMBOL(sk_page_frag_refill);
1918
1919static void __lock_sock(struct sock *sk)
1920	__releases(&sk->sk_lock.slock)
1921	__acquires(&sk->sk_lock.slock)
1922{
1923	DEFINE_WAIT(wait);
1924
1925	for (;;) {
1926		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1927					TASK_UNINTERRUPTIBLE);
1928		spin_unlock_bh(&sk->sk_lock.slock);
1929		schedule();
1930		spin_lock_bh(&sk->sk_lock.slock);
1931		if (!sock_owned_by_user(sk))
1932			break;
1933	}
1934	finish_wait(&sk->sk_lock.wq, &wait);
1935}
1936
1937static void __release_sock(struct sock *sk)
1938	__releases(&sk->sk_lock.slock)
1939	__acquires(&sk->sk_lock.slock)
1940{
1941	struct sk_buff *skb = sk->sk_backlog.head;
1942
1943	do {
1944		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1945		bh_unlock_sock(sk);
1946
1947		do {
1948			struct sk_buff *next = skb->next;
1949
1950			prefetch(next);
1951			WARN_ON_ONCE(skb_dst_is_noref(skb));
1952			skb->next = NULL;
1953			sk_backlog_rcv(sk, skb);
1954
1955			/*
1956			 * We are in process context here with softirqs
1957			 * disabled, use cond_resched_softirq() to preempt.
1958			 * This is safe to do because we've taken the backlog
1959			 * queue private:
1960			 */
1961			cond_resched_softirq();
1962
1963			skb = next;
1964		} while (skb != NULL);
1965
1966		bh_lock_sock(sk);
1967	} while ((skb = sk->sk_backlog.head) != NULL);
1968
1969	/*
1970	 * Doing the zeroing here guarantees we cannot loop forever
1971	 * while a wild producer attempts to flood us.
1972	 */
1973	sk->sk_backlog.len = 0;
1974}
1975
1976/**
1977 * sk_wait_data - wait for data to arrive at sk_receive_queue
1978 * @sk:    sock to wait on
1979 * @timeo: for how long
1980 *
1981 * Socket state, including sk->sk_err, is now changed only under the lock,
1982 * hence we may omit checks after joining the wait queue.
1983 * We check the receive queue before schedule() only as an optimization;
1984 * it is very likely that release_sock() added new data.
1985 */
1986int sk_wait_data(struct sock *sk, long *timeo)
1987{
1988	int rc;
1989	DEFINE_WAIT(wait);
1990
1991	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1992	set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1993	rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1994	clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1995	finish_wait(sk_sleep(sk), &wait);
1996	return rc;
1997}
1998EXPORT_SYMBOL(sk_wait_data);
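
/*
 * A hedged usage sketch for sk_wait_data(): a blocking ->recvmsg()
 * implementation typically loops under lock_sock(), checking the receive
 * queue and sleeping until data arrives or the timeout expires (compare the
 * receive loops in tcp_recvmsg(); details are simplified here).
 *
 *	long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 *
 *	lock_sock(sk);
 *	while (skb_queue_empty(&sk->sk_receive_queue)) {
 *		if (sk->sk_err || !timeo || signal_pending(current))
 *			break;				// report why we stopped
 *		sk_wait_data(sk, &timeo);
 *	}
 *	// dequeue and copy data here
 *	release_sock(sk);
 */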
1999
2000/**
2001 *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2002 *	@sk: socket
2003 *	@size: memory size to allocate
2004 *	@kind: allocation type
2005 *
2006 *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2007 *	rmem allocation. This function assumes that protocols which have
2008 *	memory_pressure use sk_wmem_queued as write buffer accounting.
2009 */
2010int __sk_mem_schedule(struct sock *sk, int size, int kind)
2011{
2012	struct proto *prot = sk->sk_prot;
2013	int amt = sk_mem_pages(size);
2014	long allocated;
2015	int parent_status = UNDER_LIMIT;
2016
2017	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
2018
2019	allocated = sk_memory_allocated_add(sk, amt, &parent_status);
2020
2021	/* Under limit. */
2022	if (parent_status == UNDER_LIMIT &&
2023			allocated <= sk_prot_mem_limits(sk, 0)) {
2024		sk_leave_memory_pressure(sk);
2025		return 1;
2026	}
2027
2028	/* Under pressure (we or our parents) */
2029	if ((parent_status > SOFT_LIMIT) ||
2030			allocated > sk_prot_mem_limits(sk, 1))
2031		sk_enter_memory_pressure(sk);
2032
2033	/* Over hard limit (we or our parents) */
2034	if ((parent_status == OVER_LIMIT) ||
2035			(allocated > sk_prot_mem_limits(sk, 2)))
2036		goto suppress_allocation;
2037
2038	/* guarantee minimum buffer size under pressure */
2039	if (kind == SK_MEM_RECV) {
2040		if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
2041			return 1;
2042
2043	} else { /* SK_MEM_SEND */
2044		if (sk->sk_type == SOCK_STREAM) {
2045			if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
2046				return 1;
2047		} else if (atomic_read(&sk->sk_wmem_alloc) <
2048			   prot->sysctl_wmem[0])
2049				return 1;
2050	}
2051
2052	if (sk_has_memory_pressure(sk)) {
2053		int alloc;
2054
2055		if (!sk_under_memory_pressure(sk))
2056			return 1;
2057		alloc = sk_sockets_allocated_read_positive(sk);
2058		if (sk_prot_mem_limits(sk, 2) > alloc *
2059		    sk_mem_pages(sk->sk_wmem_queued +
2060				 atomic_read(&sk->sk_rmem_alloc) +
2061				 sk->sk_forward_alloc))
2062			return 1;
2063	}
2064
2065suppress_allocation:
2066
2067	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2068		sk_stream_moderate_sndbuf(sk);
2069
2070		/* Fail only if socket is _under_ its sndbuf.
2071		 * In this case we cannot block, so we have to fail.
2072		 */
2073		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2074			return 1;
2075	}
2076
2077	trace_sock_exceed_buf_limit(sk, prot, allocated);
2078
2079	/* Alas. Undo changes. */
2080	sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
2081
2082	sk_memory_allocated_sub(sk, amt);
2083
2084	return 0;
2085}
2086EXPORT_SYMBOL(__sk_mem_schedule);
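
/*
 * Protocols normally reach __sk_mem_schedule() through the sk_rmem_schedule()
 * and sk_wmem_schedule() wrappers in include/net/sock.h, which only call it
 * when sk_forward_alloc does not already cover the request. A hedged sketch
 * of the receive-side charge, loosely following tcp_try_rmem_schedule();
 * queue pruning and statistics are omitted.
 *
 *	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
 *	    !sk_rmem_schedule(sk, skb, skb->truesize)) {
 *		// over budget: prune queues or drop the skb
 *		return -ENOBUFS;
 *	}
 *	skb_set_owner_r(skb, sk);	// accounts rmem and forward_alloc
 *	__skb_queue_tail(&sk->sk_receive_queue, skb);
 */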
2087
2088/**
2089 *	__sk_mem_reclaim - reclaim memory_allocated
2090 *	@sk: socket
2091 */
2092void __sk_mem_reclaim(struct sock *sk)
2093{
2094	sk_memory_allocated_sub(sk,
2095				sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT);
2096	sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
2097
2098	if (sk_under_memory_pressure(sk) &&
2099	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2100		sk_leave_memory_pressure(sk);
2101}
2102EXPORT_SYMBOL(__sk_mem_reclaim);
2103
2104
2105/*
2106 * Set of default routines for initialising struct proto_ops when
2107 * the protocol does not support a particular function. In certain
2108 * cases where it makes no sense for a protocol to have a "do nothing"
2109 * function, some default processing is provided.
2110 */
2111
2112int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2113{
2114	return -EOPNOTSUPP;
2115}
2116EXPORT_SYMBOL(sock_no_bind);
2117
2118int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2119		    int len, int flags)
2120{
2121	return -EOPNOTSUPP;
2122}
2123EXPORT_SYMBOL(sock_no_connect);
2124
2125int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2126{
2127	return -EOPNOTSUPP;
2128}
2129EXPORT_SYMBOL(sock_no_socketpair);
2130
2131int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
2132{
2133	return -EOPNOTSUPP;
2134}
2135EXPORT_SYMBOL(sock_no_accept);
2136
2137int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2138		    int *len, int peer)
2139{
2140	return -EOPNOTSUPP;
2141}
2142EXPORT_SYMBOL(sock_no_getname);
2143
2144unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
2145{
2146	return 0;
2147}
2148EXPORT_SYMBOL(sock_no_poll);
2149
2150int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2151{
2152	return -EOPNOTSUPP;
2153}
2154EXPORT_SYMBOL(sock_no_ioctl);
2155
2156int sock_no_listen(struct socket *sock, int backlog)
2157{
2158	return -EOPNOTSUPP;
2159}
2160EXPORT_SYMBOL(sock_no_listen);
2161
2162int sock_no_shutdown(struct socket *sock, int how)
2163{
2164	return -EOPNOTSUPP;
2165}
2166EXPORT_SYMBOL(sock_no_shutdown);
2167
2168int sock_no_setsockopt(struct socket *sock, int level, int optname,
2169		    char __user *optval, unsigned int optlen)
2170{
2171	return -EOPNOTSUPP;
2172}
2173EXPORT_SYMBOL(sock_no_setsockopt);
2174
2175int sock_no_getsockopt(struct socket *sock, int level, int optname,
2176		    char __user *optval, int __user *optlen)
2177{
2178	return -EOPNOTSUPP;
2179}
2180EXPORT_SYMBOL(sock_no_getsockopt);
2181
2182int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
2183		    size_t len)
2184{
2185	return -EOPNOTSUPP;
2186}
2187EXPORT_SYMBOL(sock_no_sendmsg);
2188
2189int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
2190		    size_t len, int flags)
2191{
2192	return -EOPNOTSUPP;
2193}
2194EXPORT_SYMBOL(sock_no_recvmsg);
2195
2196int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2197{
2198	/* Mirror missing mmap method error code */
2199	return -ENODEV;
2200}
2201EXPORT_SYMBOL(sock_no_mmap);
2202
2203ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2204{
2205	ssize_t res;
2206	struct msghdr msg = {.msg_flags = flags};
2207	struct kvec iov;
2208	char *kaddr = kmap(page);
2209	iov.iov_base = kaddr + offset;
2210	iov.iov_len = size;
2211	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2212	kunmap(page);
2213	return res;
2214}
2215EXPORT_SYMBOL(sock_no_sendpage);
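
/*
 * These stubs are meant to be plugged straight into a struct proto_ops for
 * the operations a protocol does not support. A hedged sketch for a
 * hypothetical datagram-style family (the example_* names and PF_EXAMPLE
 * are illustrative only):
 *
 *	static const struct proto_ops example_dgram_ops = {
 *		.family		= PF_EXAMPLE,		// hypothetical
 *		.owner		= THIS_MODULE,
 *		.release	= example_release,
 *		.bind		= example_bind,
 *		.connect	= sock_no_connect,
 *		.socketpair	= sock_no_socketpair,
 *		.accept		= sock_no_accept,
 *		.listen		= sock_no_listen,
 *		.mmap		= sock_no_mmap,
 *		.sendpage	= sock_no_sendpage,
 *		.sendmsg	= example_sendmsg,
 *		.recvmsg	= example_recvmsg,
 *	};
 */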
2216
2217/*
2218 *	Default Socket Callbacks
2219 */
2220
2221static void sock_def_wakeup(struct sock *sk)
2222{
2223	struct socket_wq *wq;
2224
2225	rcu_read_lock();
2226	wq = rcu_dereference(sk->sk_wq);
2227	if (wq_has_sleeper(wq))
2228		wake_up_interruptible_all(&wq->wait);
2229	rcu_read_unlock();
2230}
2231
2232static void sock_def_error_report(struct sock *sk)
2233{
2234	struct socket_wq *wq;
2235
2236	rcu_read_lock();
2237	wq = rcu_dereference(sk->sk_wq);
2238	if (wq_has_sleeper(wq))
2239		wake_up_interruptible_poll(&wq->wait, POLLERR);
2240	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2241	rcu_read_unlock();
2242}
2243
2244static void sock_def_readable(struct sock *sk)
2245{
2246	struct socket_wq *wq;
2247
2248	rcu_read_lock();
2249	wq = rcu_dereference(sk->sk_wq);
2250	if (wq_has_sleeper(wq))
2251		wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
2252						POLLRDNORM | POLLRDBAND);
2253	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2254	rcu_read_unlock();
2255}
2256
2257static void sock_def_write_space(struct sock *sk)
2258{
2259	struct socket_wq *wq;
2260
2261	rcu_read_lock();
2262
2263	/* Do not wake up a writer until he can make "significant"
2264	 * progress.  --DaveM
2265	 */
2266	if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2267		wq = rcu_dereference(sk->sk_wq);
2268		if (wq_has_sleeper(wq))
2269			wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
2270						POLLWRNORM | POLLWRBAND);
2271
2272		/* Should agree with poll, otherwise some programs break */
2273		if (sock_writeable(sk))
2274			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2275	}
2276
2277	rcu_read_unlock();
2278}
2279
2280static void sock_def_destruct(struct sock *sk)
2281{
2282	kfree(sk->sk_protinfo);
2283}
2284
2285void sk_send_sigurg(struct sock *sk)
2286{
2287	if (sk->sk_socket && sk->sk_socket->file)
2288		if (send_sigurg(&sk->sk_socket->file->f_owner))
2289			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2290}
2291EXPORT_SYMBOL(sk_send_sigurg);
2292
2293void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2294		    unsigned long expires)
2295{
2296	if (!mod_timer(timer, expires))
2297		sock_hold(sk);
2298}
2299EXPORT_SYMBOL(sk_reset_timer);
2300
2301void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2302{
2303	if (del_timer(timer))
2304		__sock_put(sk);
2305}
2306EXPORT_SYMBOL(sk_stop_timer);
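
/*
 * sk_reset_timer()/sk_stop_timer() keep the sock reference count in step with
 * a pending timer: a reference is taken when an idle timer is armed and
 * dropped when it is deleted or when the handler finishes. A hedged sketch of
 * the handler side, mirroring protocol timers such as the TCP ones (the
 * example_* names are hypothetical):
 *
 *	static void example_timer_handler(unsigned long data)
 *	{
 *		struct sock *sk = (struct sock *)data;
 *
 *		bh_lock_sock(sk);
 *		if (!sock_owned_by_user(sk))
 *			example_do_work(sk);		// hypothetical
 *		else
 *			sk_reset_timer(sk, &sk->sk_timer, jiffies + HZ / 20);
 *		bh_unlock_sock(sk);
 *		sock_put(sk);	// pairs with the hold taken in sk_reset_timer()
 *	}
 */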
2307
2308void sock_init_data(struct socket *sock, struct sock *sk)
2309{
2310	skb_queue_head_init(&sk->sk_receive_queue);
2311	skb_queue_head_init(&sk->sk_write_queue);
2312	skb_queue_head_init(&sk->sk_error_queue);
2313#ifdef CONFIG_NET_DMA
2314	skb_queue_head_init(&sk->sk_async_wait_queue);
2315#endif
2316
2317	sk->sk_send_head	=	NULL;
2318
2319	init_timer(&sk->sk_timer);
2320
2321	sk->sk_allocation	=	GFP_KERNEL;
2322	sk->sk_rcvbuf		=	sysctl_rmem_default;
2323	sk->sk_sndbuf		=	sysctl_wmem_default;
2324	sk->sk_state		=	TCP_CLOSE;
2325	sk_set_socket(sk, sock);
2326
2327	sock_set_flag(sk, SOCK_ZAPPED);
2328
2329	if (sock) {
2330		sk->sk_type	=	sock->type;
2331		sk->sk_wq	=	sock->wq;
2332		sock->sk	=	sk;
2333	} else
2334		sk->sk_wq	=	NULL;
2335
2336	spin_lock_init(&sk->sk_dst_lock);
2337	rwlock_init(&sk->sk_callback_lock);
2338	lockdep_set_class_and_name(&sk->sk_callback_lock,
2339			af_callback_keys + sk->sk_family,
2340			af_family_clock_key_strings[sk->sk_family]);
2341
2342	sk->sk_state_change	=	sock_def_wakeup;
2343	sk->sk_data_ready	=	sock_def_readable;
2344	sk->sk_write_space	=	sock_def_write_space;
2345	sk->sk_error_report	=	sock_def_error_report;
2346	sk->sk_destruct		=	sock_def_destruct;
2347
2348	sk->sk_frag.page	=	NULL;
2349	sk->sk_frag.offset	=	0;
2350	sk->sk_peek_off		=	-1;
2351
2352	sk->sk_peer_pid 	=	NULL;
2353	sk->sk_peer_cred	=	NULL;
2354	sk->sk_write_pending	=	0;
2355	sk->sk_rcvlowat		=	1;
2356	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
2357	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
2358
2359	sk->sk_stamp = ktime_set(-1L, 0);
2360
2361#ifdef CONFIG_NET_RX_BUSY_POLL
2362	sk->sk_napi_id		=	0;
2363	sk->sk_ll_usec		=	sysctl_net_busy_read;
2364#endif
2365
2366	sk->sk_max_pacing_rate = ~0U;
2367	sk->sk_pacing_rate = ~0U;
2368	/*
2369	 * Before updating sk_refcnt, we must commit prior changes to memory
2370	 * (Documentation/RCU/rculist_nulls.txt for details)
2371	 */
2372	smp_wmb();
2373	atomic_set(&sk->sk_refcnt, 1);
2374	atomic_set(&sk->sk_drops, 0);
2375}
2376EXPORT_SYMBOL(sock_init_data);
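
/*
 * The callbacks installed by sock_init_data() are only defaults; in-kernel
 * users of a socket commonly override them under sk_callback_lock. A hedged
 * sketch of that pattern, as used by kernel socket consumers such as sunrpc
 * (the my_* names are hypothetical):
 *
 *	write_lock_bh(&sk->sk_callback_lock);
 *	sk->sk_user_data = my_ctx;			// hypothetical context
 *	my_ctx->saved_data_ready = sk->sk_data_ready;
 *	sk->sk_data_ready = my_data_ready;		// hypothetical callback
 *	write_unlock_bh(&sk->sk_callback_lock);
 */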
2377
2378void lock_sock_nested(struct sock *sk, int subclass)
2379{
2380	might_sleep();
2381	spin_lock_bh(&sk->sk_lock.slock);
2382	if (sk->sk_lock.owned)
2383		__lock_sock(sk);
2384	sk->sk_lock.owned = 1;
2385	spin_unlock(&sk->sk_lock.slock);
2386	/*
2387	 * The sk_lock has mutex_lock() semantics here:
2388	 */
2389	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2390	local_bh_enable();
2391}
2392EXPORT_SYMBOL(lock_sock_nested);
2393
2394void release_sock(struct sock *sk)
2395{
2396	/*
2397	 * The sk_lock has mutex_unlock() semantics:
2398	 */
2399	mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
2400
2401	spin_lock_bh(&sk->sk_lock.slock);
2402	if (sk->sk_backlog.tail)
2403		__release_sock(sk);
2404
2405	/* Warning: release_cb() might need to release sk ownership,
2406	 * i.e. call sock_release_ownership(sk) before us.
2407	 */
2408	if (sk->sk_prot->release_cb)
2409		sk->sk_prot->release_cb(sk);
2410
2411	sock_release_ownership(sk);
2412	if (waitqueue_active(&sk->sk_lock.wq))
2413		wake_up(&sk->sk_lock.wq);
2414	spin_unlock_bh(&sk->sk_lock.slock);
2415}
2416EXPORT_SYMBOL(release_sock);
2417
2418/**
2419 * lock_sock_fast - fast version of lock_sock
2420 * @sk: socket
2421 *
2422 * This version should be used for very small sections, where the process won't block.
2423 * return false if fast path is taken
2424 *   sk_lock.slock locked, owned = 0, BH disabled
2425 * return true if slow path is taken
2426 *   sk_lock.slock unlocked, owned = 1, BH enabled
2427 */
2428bool lock_sock_fast(struct sock *sk)
2429{
2430	might_sleep();
2431	spin_lock_bh(&sk->sk_lock.slock);
2432
2433	if (!sk->sk_lock.owned)
2434		/*
2435		 * Note: fast path taken; we return with BH disabled and slock held
2436		 */
2437		return false;
2438
2439	__lock_sock(sk);
2440	sk->sk_lock.owned = 1;
2441	spin_unlock(&sk->sk_lock.slock);
2442	/*
2443	 * The sk_lock has mutex_lock() semantics here:
2444	 */
2445	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2446	local_bh_enable();
2447	return true;
2448}
2449EXPORT_SYMBOL(lock_sock_fast);
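
/*
 * lock_sock_fast() pairs with unlock_sock_fast(), which does a plain
 * spin_unlock_bh() on the fast path and a full release_sock() on the slow
 * path. A hedged usage sketch for a short critical section:
 *
 *	bool slow = lock_sock_fast(sk);
 *
 *	// short, non-blocking work on the socket here
 *
 *	unlock_sock_fast(sk, slow);
 */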
2450
2451int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2452{
2453	struct timeval tv;
2454	if (!sock_flag(sk, SOCK_TIMESTAMP))
2455		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2456	tv = ktime_to_timeval(sk->sk_stamp);
2457	if (tv.tv_sec == -1)
2458		return -ENOENT;
2459	if (tv.tv_sec == 0) {
2460		sk->sk_stamp = ktime_get_real();
2461		tv = ktime_to_timeval(sk->sk_stamp);
2462	}
2463	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2464}
2465EXPORT_SYMBOL(sock_get_timestamp);
2466
2467int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2468{
2469	struct timespec ts;
2470	if (!sock_flag(sk, SOCK_TIMESTAMP))
2471		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2472	ts = ktime_to_timespec(sk->sk_stamp);
2473	if (ts.tv_sec == -1)
2474		return -ENOENT;
2475	if (ts.tv_sec == 0) {
2476		sk->sk_stamp = ktime_get_real();
2477		ts = ktime_to_timespec(sk->sk_stamp);
2478	}
2479	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2480}
2481EXPORT_SYMBOL(sock_get_timestampns);
2482
2483void sock_enable_timestamp(struct sock *sk, int flag)
2484{
2485	if (!sock_flag(sk, flag)) {
2486		unsigned long previous_flags = sk->sk_flags;
2487
2488		sock_set_flag(sk, flag);
2489		/*
2490		 * we just set one of the two flags which require net
2491		 * time stamping, but time stamping might have been on
2492		 * already because of the other one
2493		 */
2494		if (!(previous_flags & SK_FLAGS_TIMESTAMP))
2495			net_enable_timestamp();
2496	}
2497}
2498
2499int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
2500		       int level, int type)
2501{
2502	struct sock_exterr_skb *serr;
2503	struct sk_buff *skb, *skb2;
2504	int copied, err;
2505
2506	err = -EAGAIN;
2507	skb = skb_dequeue(&sk->sk_error_queue);
2508	if (skb == NULL)
2509		goto out;
2510
2511	copied = skb->len;
2512	if (copied > len) {
2513		msg->msg_flags |= MSG_TRUNC;
2514		copied = len;
2515	}
2516	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
2517	if (err)
2518		goto out_free_skb;
2519
2520	sock_recv_timestamp(msg, sk, skb);
2521
2522	serr = SKB_EXT_ERR(skb);
2523	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
2524
2525	msg->msg_flags |= MSG_ERRQUEUE;
2526	err = copied;
2527
2528	/* Reset and regenerate socket error */
2529	spin_lock_bh(&sk->sk_error_queue.lock);
2530	sk->sk_err = 0;
2531	if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) {
2532		sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno;
2533		spin_unlock_bh(&sk->sk_error_queue.lock);
2534		sk->sk_error_report(sk);
2535	} else
2536		spin_unlock_bh(&sk->sk_error_queue.lock);
2537
2538out_free_skb:
2539	kfree_skb(skb);
2540out:
2541	return err;
2542}
2543EXPORT_SYMBOL(sock_recv_errqueue);
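
/*
 * sock_recv_errqueue() is what ultimately services a userspace
 * recvmsg(fd, &msg, MSG_ERRQUEUE) call for protocols that use it. A hedged
 * sketch of the userspace side, which walks the control messages looking for
 * the struct sock_extended_err delivered above (cbuf is a local buffer for
 * control data; error handling is simplified):
 *
 *	struct msghdr msg = {
 *		.msg_control	= cbuf,
 *		.msg_controllen	= sizeof(cbuf),
 *	};
 *	struct cmsghdr *cm;
 *
 *	if (recvmsg(fd, &msg, MSG_ERRQUEUE) < 0)
 *		return;					// nothing queued
 *	for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) {
 *		if (cm->cmsg_level == SOL_IP && cm->cmsg_type == IP_RECVERR) {
 *			struct sock_extended_err *ee =
 *				(struct sock_extended_err *)CMSG_DATA(cm);
 *			// ee->ee_errno, ee->ee_origin, ... describe the error
 *		}
 *	}
 */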
2544
2545/*
2546 *	Get a socket option on a socket.
2547 *
2548 *	FIX: POSIX 1003.1g is very ambiguous here. It states that
2549 *	asynchronous errors should be reported by getsockopt. We assume
2550 *	this means if you specify SO_ERROR (otherwise what's the point of it).
2551 */
2552int sock_common_getsockopt(struct socket *sock, int level, int optname,
2553			   char __user *optval, int __user *optlen)
2554{
2555	struct sock *sk = sock->sk;
2556
2557	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2558}
2559EXPORT_SYMBOL(sock_common_getsockopt);
2560
2561#ifdef CONFIG_COMPAT
2562int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2563				  char __user *optval, int __user *optlen)
2564{
2565	struct sock *sk = sock->sk;
2566
2567	if (sk->sk_prot->compat_getsockopt != NULL)
2568		return sk->sk_prot->compat_getsockopt(sk, level, optname,
2569						      optval, optlen);
2570	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2571}
2572EXPORT_SYMBOL(compat_sock_common_getsockopt);
2573#endif
2574
2575int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
2576			struct msghdr *msg, size_t size, int flags)
2577{
2578	struct sock *sk = sock->sk;
2579	int addr_len = 0;
2580	int err;
2581
2582	err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
2583				   flags & ~MSG_DONTWAIT, &addr_len);
2584	if (err >= 0)
2585		msg->msg_namelen = addr_len;
2586	return err;
2587}
2588EXPORT_SYMBOL(sock_common_recvmsg);
2589
2590/*
2591 *	Set socket options on an inet socket.
2592 */
2593int sock_common_setsockopt(struct socket *sock, int level, int optname,
2594			   char __user *optval, unsigned int optlen)
2595{
2596	struct sock *sk = sock->sk;
2597
2598	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2599}
2600EXPORT_SYMBOL(sock_common_setsockopt);
2601
2602#ifdef CONFIG_COMPAT
2603int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2604				  char __user *optval, unsigned int optlen)
2605{
2606	struct sock *sk = sock->sk;
2607
2608	if (sk->sk_prot->compat_setsockopt != NULL)
2609		return sk->sk_prot->compat_setsockopt(sk, level, optname,
2610						      optval, optlen);
2611	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2612}
2613EXPORT_SYMBOL(compat_sock_common_setsockopt);
2614#endif
2615
2616void sk_common_release(struct sock *sk)
2617{
2618	if (sk->sk_prot->destroy)
2619		sk->sk_prot->destroy(sk);
2620
2621	/*
2622	 * Observation: when sk_common_release() is called, processes have
2623	 * no access to the socket, but the network stack still does.
2624	 * Step one, detach it from networking:
2625	 *
2626	 * A. Remove from hash tables.
2627	 */
2628
2629	sk->sk_prot->unhash(sk);
2630
2631	/*
2632	 * At this point the socket cannot receive new packets, but it is possible
2633	 * that some packets are still in flight because another CPU ran the receiver
2634	 * and did its hash table lookup before we unhashed the socket. They will
2635	 * reach the receive queue and be purged by the socket destructor.
2636	 *
2637	 * We also still have packets pending on the receive queue and, probably,
2638	 * our own packets waiting in device queues. sock_destroy will drain the
2639	 * receive queue, but transmitted packets will delay socket destruction
2640	 * until the last reference is released.
2641	 */
2642
2643	sock_orphan(sk);
2644
2645	xfrm_sk_free_policy(sk);
2646
2647	sk_refcnt_debug_release(sk);
2648
2649	if (sk->sk_frag.page) {
2650		put_page(sk->sk_frag.page);
2651		sk->sk_frag.page = NULL;
2652	}
2653
2654	sock_put(sk);
2655}
2656EXPORT_SYMBOL(sk_common_release);
2657
2658#ifdef CONFIG_PROC_FS
2659#define PROTO_INUSE_NR	64	/* should be enough for the first time */
2660struct prot_inuse {
2661	int val[PROTO_INUSE_NR];
2662};
2663
2664static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2665
2666#ifdef CONFIG_NET_NS
2667void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2668{
2669	__this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
2670}
2671EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2672
2673int sock_prot_inuse_get(struct net *net, struct proto *prot)
2674{
2675	int cpu, idx = prot->inuse_idx;
2676	int res = 0;
2677
2678	for_each_possible_cpu(cpu)
2679		res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2680
2681	return res >= 0 ? res : 0;
2682}
2683EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2684
2685static int __net_init sock_inuse_init_net(struct net *net)
2686{
2687	net->core.inuse = alloc_percpu(struct prot_inuse);
2688	return net->core.inuse ? 0 : -ENOMEM;
2689}
2690
2691static void __net_exit sock_inuse_exit_net(struct net *net)
2692{
2693	free_percpu(net->core.inuse);
2694}
2695
2696static struct pernet_operations net_inuse_ops = {
2697	.init = sock_inuse_init_net,
2698	.exit = sock_inuse_exit_net,
2699};
2700
2701static __init int net_inuse_init(void)
2702{
2703	if (register_pernet_subsys(&net_inuse_ops))
2704		panic("Cannot initialize net inuse counters");
2705
2706	return 0;
2707}
2708
2709core_initcall(net_inuse_init);
2710#else
2711static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2712
2713void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2714{
2715	__this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
2716}
2717EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2718
2719int sock_prot_inuse_get(struct net *net, struct proto *prot)
2720{
2721	int cpu, idx = prot->inuse_idx;
2722	int res = 0;
2723
2724	for_each_possible_cpu(cpu)
2725		res += per_cpu(prot_inuse, cpu).val[idx];
2726
2727	return res >= 0 ? res : 0;
2728}
2729EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2730#endif
2731
2732static void assign_proto_idx(struct proto *prot)
2733{
2734	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2735
2736	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2737		pr_err("PROTO_INUSE_NR exhausted\n");
2738		return;
2739	}
2740
2741	set_bit(prot->inuse_idx, proto_inuse_idx);
2742}
2743
2744static void release_proto_idx(struct proto *prot)
2745{
2746	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2747		clear_bit(prot->inuse_idx, proto_inuse_idx);
2748}
2749#else
2750static inline void assign_proto_idx(struct proto *prot)
2751{
2752}
2753
2754static inline void release_proto_idx(struct proto *prot)
2755{
2756}
2757#endif
2758
2759int proto_register(struct proto *prot, int alloc_slab)
2760{
2761	if (alloc_slab) {
2762		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2763					SLAB_HWCACHE_ALIGN | prot->slab_flags,
2764					NULL);
2765
2766		if (prot->slab == NULL) {
2767			pr_crit("%s: Can't create sock SLAB cache!\n",
2768				prot->name);
2769			goto out;
2770		}
2771
2772		if (prot->rsk_prot != NULL) {
2773			prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name);
2774			if (prot->rsk_prot->slab_name == NULL)
2775				goto out_free_sock_slab;
2776
2777			prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
2778								 prot->rsk_prot->obj_size, 0,
2779								 SLAB_HWCACHE_ALIGN, NULL);
2780
2781			if (prot->rsk_prot->slab == NULL) {
2782				pr_crit("%s: Can't create request sock SLAB cache!\n",
2783					prot->name);
2784				goto out_free_request_sock_slab_name;
2785			}
2786		}
2787
2788		if (prot->twsk_prot != NULL) {
2789			prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
2790
2791			if (prot->twsk_prot->twsk_slab_name == NULL)
2792				goto out_free_request_sock_slab;
2793
2794			prot->twsk_prot->twsk_slab =
2795				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2796						  prot->twsk_prot->twsk_obj_size,
2797						  0,
2798						  SLAB_HWCACHE_ALIGN |
2799							prot->slab_flags,
2800						  NULL);
2801			if (prot->twsk_prot->twsk_slab == NULL)
2802				goto out_free_timewait_sock_slab_name;
2803		}
2804	}
2805
2806	mutex_lock(&proto_list_mutex);
2807	list_add(&prot->node, &proto_list);
2808	assign_proto_idx(prot);
2809	mutex_unlock(&proto_list_mutex);
2810	return 0;
2811
2812out_free_timewait_sock_slab_name:
2813	kfree(prot->twsk_prot->twsk_slab_name);
2814out_free_request_sock_slab:
2815	if (prot->rsk_prot && prot->rsk_prot->slab) {
2816		kmem_cache_destroy(prot->rsk_prot->slab);
2817		prot->rsk_prot->slab = NULL;
2818	}
2819out_free_request_sock_slab_name:
2820	if (prot->rsk_prot)
2821		kfree(prot->rsk_prot->slab_name);
2822out_free_sock_slab:
2823	kmem_cache_destroy(prot->slab);
2824	prot->slab = NULL;
2825out:
2826	return -ENOBUFS;
2827}
2828EXPORT_SYMBOL(proto_register);
2829
2830void proto_unregister(struct proto *prot)
2831{
2832	mutex_lock(&proto_list_mutex);
2833	release_proto_idx(prot);
2834	list_del(&prot->node);
2835	mutex_unlock(&proto_list_mutex);
2836
2837	if (prot->slab != NULL) {
2838		kmem_cache_destroy(prot->slab);
2839		prot->slab = NULL;
2840	}
2841
2842	if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
2843		kmem_cache_destroy(prot->rsk_prot->slab);
2844		kfree(prot->rsk_prot->slab_name);
2845		prot->rsk_prot->slab = NULL;
2846	}
2847
2848	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2849		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2850		kfree(prot->twsk_prot->twsk_slab_name);
2851		prot->twsk_prot->twsk_slab = NULL;
2852	}
2853}
2854EXPORT_SYMBOL(proto_unregister);
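
/*
 * proto_register()/proto_unregister() are normally called from a protocol
 * module's init/exit paths, usually alongside sock_register() for the address
 * family. A hedged sketch (the example_* names are hypothetical):
 *
 *	static struct proto example_proto = {
 *		.name		= "EXAMPLE",
 *		.owner		= THIS_MODULE,
 *		.obj_size	= sizeof(struct example_sock),	// hypothetical
 *	};
 *
 *	static int __init example_init(void)
 *	{
 *		int rc = proto_register(&example_proto, 1);
 *
 *		if (rc)
 *			return rc;
 *		rc = sock_register(&example_family_ops);	// hypothetical
 *		if (rc)
 *			proto_unregister(&example_proto);
 *		return rc;
 *	}
 */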
2855
2856#ifdef CONFIG_PROC_FS
2857static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2858	__acquires(proto_list_mutex)
2859{
2860	mutex_lock(&proto_list_mutex);
2861	return seq_list_start_head(&proto_list, *pos);
2862}
2863
2864static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2865{
2866	return seq_list_next(v, &proto_list, pos);
2867}
2868
2869static void proto_seq_stop(struct seq_file *seq, void *v)
2870	__releases(proto_list_mutex)
2871{
2872	mutex_unlock(&proto_list_mutex);
2873}
2874
2875static char proto_method_implemented(const void *method)
2876{
2877	return method == NULL ? 'n' : 'y';
2878}
2879static long sock_prot_memory_allocated(struct proto *proto)
2880{
2881	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
2882}
2883
2884static char *sock_prot_memory_pressure(struct proto *proto)
2885{
2886	return proto->memory_pressure != NULL ?
2887	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
2888}
2889
2890static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2891{
2892
2893	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
2894			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2895		   proto->name,
2896		   proto->obj_size,
2897		   sock_prot_inuse_get(seq_file_net(seq), proto),
2898		   sock_prot_memory_allocated(proto),
2899		   sock_prot_memory_pressure(proto),
2900		   proto->max_header,
2901		   proto->slab == NULL ? "no" : "yes",
2902		   module_name(proto->owner),
2903		   proto_method_implemented(proto->close),
2904		   proto_method_implemented(proto->connect),
2905		   proto_method_implemented(proto->disconnect),
2906		   proto_method_implemented(proto->accept),
2907		   proto_method_implemented(proto->ioctl),
2908		   proto_method_implemented(proto->init),
2909		   proto_method_implemented(proto->destroy),
2910		   proto_method_implemented(proto->shutdown),
2911		   proto_method_implemented(proto->setsockopt),
2912		   proto_method_implemented(proto->getsockopt),
2913		   proto_method_implemented(proto->sendmsg),
2914		   proto_method_implemented(proto->recvmsg),
2915		   proto_method_implemented(proto->sendpage),
2916		   proto_method_implemented(proto->bind),
2917		   proto_method_implemented(proto->backlog_rcv),
2918		   proto_method_implemented(proto->hash),
2919		   proto_method_implemented(proto->unhash),
2920		   proto_method_implemented(proto->get_port),
2921		   proto_method_implemented(proto->enter_memory_pressure));
2922}
2923
2924static int proto_seq_show(struct seq_file *seq, void *v)
2925{
2926	if (v == &proto_list)
2927		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2928			   "protocol",
2929			   "size",
2930			   "sockets",
2931			   "memory",
2932			   "press",
2933			   "maxhdr",
2934			   "slab",
2935			   "module",
2936			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2937	else
2938		proto_seq_printf(seq, list_entry(v, struct proto, node));
2939	return 0;
2940}
2941
2942static const struct seq_operations proto_seq_ops = {
2943	.start  = proto_seq_start,
2944	.next   = proto_seq_next,
2945	.stop   = proto_seq_stop,
2946	.show   = proto_seq_show,
2947};
2948
2949static int proto_seq_open(struct inode *inode, struct file *file)
2950{
2951	return seq_open_net(inode, file, &proto_seq_ops,
2952			    sizeof(struct seq_net_private));
2953}
2954
2955static const struct file_operations proto_seq_fops = {
2956	.owner		= THIS_MODULE,
2957	.open		= proto_seq_open,
2958	.read		= seq_read,
2959	.llseek		= seq_lseek,
2960	.release	= seq_release_net,
2961};
2962
2963static __net_init int proto_init_net(struct net *net)
2964{
2965	if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
2966		return -ENOMEM;
2967
2968	return 0;
2969}
2970
2971static __net_exit void proto_exit_net(struct net *net)
2972{
2973	remove_proc_entry("protocols", net->proc_net);
2974}
2975
2976
2977static __net_initdata struct pernet_operations proto_net_ops = {
2978	.init = proto_init_net,
2979	.exit = proto_exit_net,
2980};
2981
2982static int __init proto_init(void)
2983{
2984	return register_pernet_subsys(&proto_net_ops);
2985}
2986
2987subsys_initcall(proto_init);
2988
2989#endif /* PROC_FS */
2990