sock.c revision 6d8ebc8a27e1b187abfb06dd79b35a393aa9f2a2
1/*
2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
3 *		operating system.  INET is implemented using the  BSD Socket
4 *		interface as the means of communication with the user level.
5 *
6 *		Generic socket support routines. Memory allocators, socket lock/release
7 *		handler for protocols to use and generic option handler.
8 *
9 *
10 * Authors:	Ross Biro
11 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 *		Florian La Roche, <flla@stud.uni-sb.de>
13 *		Alan Cox, <A.Cox@swansea.ac.uk>
14 *
15 * Fixes:
16 *		Alan Cox	: 	Numerous verify_area() problems
17 *		Alan Cox	:	Connecting on a connecting socket
18 *					now returns an error for tcp.
19 *		Alan Cox	:	sock->protocol is set correctly,
20 *					and is not sometimes left as 0.
21 *		Alan Cox	:	connect handles icmp errors on a
22 *					connect properly. Unfortunately there
23 *					is a restart syscall nasty there. I
24 *					can't match BSD without hacking the C
25 *					library. Ideas urgently sought!
26 *		Alan Cox	:	Disallow bind() to addresses that are
27 *					not ours - especially broadcast ones!!
28 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30 *					instead they leave that for the DESTROY timer.
31 *		Alan Cox	:	Clean up error flag in accept
32 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33 *					was buggy. Put a remove_sock() in the handler
34 *					for memory when we hit 0. Also altered the timer
35 *					code. The ACK stuff can wait and needs major
36 *					TCP layer surgery.
37 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38 *					and fixed timer/inet_bh race.
39 *		Alan Cox	:	Added zapped flag for TCP
40 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47 *	Pauline Middelink	:	identd support
48 *		Alan Cox	:	Fixed connect() taking signals I think.
49 *		Alan Cox	:	SO_LINGER supported
50 *		Alan Cox	:	Error reporting fixes
51 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52 *		Alan Cox	:	inet sockets don't set sk->type!
53 *		Alan Cox	:	Split socket option code
54 *		Alan Cox	:	Callbacks
55 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56 *		Alex		:	Removed restriction on inet fioctl
57 *		Alan Cox	:	Splitting INET from NET core
58 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60 *		Alan Cox	:	Split IP from generic code
61 *		Alan Cox	:	New kfree_skbmem()
62 *		Alan Cox	:	Make SO_DEBUG superuser only.
63 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64 *					(compatibility fix)
65 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66 *		Alan Cox	:	Allocator for a socket is settable.
67 *		Alan Cox	:	SO_ERROR includes soft errors.
68 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69 *		Alan Cox	: 	Generic socket allocation to make hooks
70 *					easier (suggested by Craig Metz).
71 *		Michael Pall	:	SO_ERROR returns positive errno again
72 *              Steve Whitehouse:       Added default destructor to free
73 *                                      protocol private data.
74 *              Steve Whitehouse:       Added various other default routines
75 *                                      common to several socket families.
76 *              Chris Evans     :       Call suser() check last on F_SETOWN
77 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79 *		Andi Kleen	:	Fix write_space callback
80 *		Chris Evans	:	Security fixes - signedness again
81 *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82 *
83 * To Fix:
84 *
85 *
86 *		This program is free software; you can redistribute it and/or
87 *		modify it under the terms of the GNU General Public License
88 *		as published by the Free Software Foundation; either version
89 *		2 of the License, or (at your option) any later version.
90 */
91
92#include <linux/capability.h>
93#include <linux/errno.h>
94#include <linux/types.h>
95#include <linux/socket.h>
96#include <linux/in.h>
97#include <linux/kernel.h>
98#include <linux/module.h>
99#include <linux/proc_fs.h>
100#include <linux/seq_file.h>
101#include <linux/sched.h>
102#include <linux/timer.h>
103#include <linux/string.h>
104#include <linux/sockios.h>
105#include <linux/net.h>
106#include <linux/mm.h>
107#include <linux/slab.h>
108#include <linux/interrupt.h>
109#include <linux/poll.h>
110#include <linux/tcp.h>
111#include <linux/init.h>
112#include <linux/highmem.h>
113#include <linux/user_namespace.h>
114#include <linux/static_key.h>
115#include <linux/memcontrol.h>
116#include <linux/prefetch.h>
117
118#include <asm/uaccess.h>
119
120#include <linux/netdevice.h>
121#include <net/protocol.h>
122#include <linux/skbuff.h>
123#include <net/net_namespace.h>
124#include <net/request_sock.h>
125#include <net/sock.h>
126#include <linux/net_tstamp.h>
127#include <net/xfrm.h>
128#include <linux/ipsec.h>
129#include <net/cls_cgroup.h>
130#include <net/netprio_cgroup.h>
131
132#include <linux/filter.h>
133
134#include <trace/events/sock.h>
135
136#ifdef CONFIG_INET
137#include <net/tcp.h>
138#endif
139
140static DEFINE_MUTEX(proto_list_mutex);
141static LIST_HEAD(proto_list);
142
143#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
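/*
 * Walk the registered protocols and call each protocol's init_cgroup hook
 * for the new cgroup; on failure, unwind by calling destroy_cgroup on the
 * protocols already initialised.  mem_cgroup_sockets_destroy() below undoes
 * this for every protocol when the cgroup goes away.
 */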
144int mem_cgroup_sockets_init(struct cgroup *cgrp, struct cgroup_subsys *ss)
145{
146	struct proto *proto;
147	int ret = 0;
148
149	mutex_lock(&proto_list_mutex);
150	list_for_each_entry(proto, &proto_list, node) {
151		if (proto->init_cgroup) {
152			ret = proto->init_cgroup(cgrp, ss);
153			if (ret)
154				goto out;
155		}
156	}
157
158	mutex_unlock(&proto_list_mutex);
159	return ret;
160out:
161	list_for_each_entry_continue_reverse(proto, &proto_list, node)
162		if (proto->destroy_cgroup)
163			proto->destroy_cgroup(cgrp);
164	mutex_unlock(&proto_list_mutex);
165	return ret;
166}
167
168void mem_cgroup_sockets_destroy(struct cgroup *cgrp)
169{
170	struct proto *proto;
171
172	mutex_lock(&proto_list_mutex);
173	list_for_each_entry_reverse(proto, &proto_list, node)
174		if (proto->destroy_cgroup)
175			proto->destroy_cgroup(cgrp);
176	mutex_unlock(&proto_list_mutex);
177}
178#endif
179
180/*
181 * Each address family might have different locking rules, so we have
182 * one slock key per address family:
183 */
184static struct lock_class_key af_family_keys[AF_MAX];
185static struct lock_class_key af_family_slock_keys[AF_MAX];
186
187struct static_key memcg_socket_limit_enabled;
188EXPORT_SYMBOL(memcg_socket_limit_enabled);
189
190/*
191 * Make lock validator output more readable. (we pre-construct these
 192 * strings at build time, so that runtime initialization of socket
193 * locks is fast):
194 */
195static const char *const af_family_key_strings[AF_MAX+1] = {
196  "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
197  "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
198  "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
199  "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
200  "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
201  "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
202  "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
203  "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
204  "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
205  "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
 206  "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-AF_IUCV"     ,
207  "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
208  "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
209  "sk_lock-AF_NFC"   , "sk_lock-AF_MAX"
210};
211static const char *const af_family_slock_key_strings[AF_MAX+1] = {
212  "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
213  "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
214  "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
215  "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
216  "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
217  "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
218  "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
219  "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
220  "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
221  "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
222  "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
223  "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
224  "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
225  "slock-AF_NFC"   , "slock-AF_MAX"
226};
227static const char *const af_family_clock_key_strings[AF_MAX+1] = {
228  "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
229  "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
230  "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
231  "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
232  "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
233  "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
234  "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
235  "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
236  "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
237  "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
238  "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
239  "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
240  "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
241  "clock-AF_NFC"   , "clock-AF_MAX"
242};
243
244/*
245 * sk_callback_lock locking rules are per-address-family,
246 * so split the lock classes by using a per-AF key:
247 */
248static struct lock_class_key af_callback_keys[AF_MAX];
249
250/* Take into consideration the size of the struct sk_buff overhead in the
251 * determination of these values, since that is non-constant across
252 * platforms.  This makes socket queueing behavior and performance
253 * not depend upon such differences.
254 */
255#define _SK_MEM_PACKETS		256
256#define _SK_MEM_OVERHEAD	SKB_TRUESIZE(256)
257#define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
258#define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
259
260/* Run time adjustable parameters. */
261__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
262EXPORT_SYMBOL(sysctl_wmem_max);
263__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
264EXPORT_SYMBOL(sysctl_rmem_max);
265__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
266__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
267
268/* Maximal space eaten by iovec or ancillary data plus some space */
269int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
270EXPORT_SYMBOL(sysctl_optmem_max);
271
272#if defined(CONFIG_CGROUPS)
273#if !defined(CONFIG_NET_CLS_CGROUP)
274int net_cls_subsys_id = -1;
275EXPORT_SYMBOL_GPL(net_cls_subsys_id);
276#endif
277#if !defined(CONFIG_NETPRIO_CGROUP)
278int net_prio_subsys_id = -1;
279EXPORT_SYMBOL_GPL(net_prio_subsys_id);
280#endif
281#endif
282
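/*
 * Copy a struct timeval from userspace and convert it to a jiffies timeout
 * in *timeo_p.  A zero timeval means "wait forever" (MAX_SCHEDULE_TIMEOUT);
 * negative seconds are clamped to zero with a rate-limited warning.
 */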
283static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
284{
285	struct timeval tv;
286
287	if (optlen < sizeof(tv))
288		return -EINVAL;
289	if (copy_from_user(&tv, optval, sizeof(tv)))
290		return -EFAULT;
291	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
292		return -EDOM;
293
294	if (tv.tv_sec < 0) {
295		static int warned __read_mostly;
296
297		*timeo_p = 0;
298		if (warned < 10 && net_ratelimit()) {
299			warned++;
300			printk(KERN_INFO "sock_set_timeout: `%s' (pid %d) "
301			       "tries to set negative timeout\n",
302				current->comm, task_pid_nr(current));
303		}
304		return 0;
305	}
306	*timeo_p = MAX_SCHEDULE_TIMEOUT;
307	if (tv.tv_sec == 0 && tv.tv_usec == 0)
308		return 0;
309	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
310		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
311	return 0;
312}
313
314static void sock_warn_obsolete_bsdism(const char *name)
315{
316	static int warned;
317	static char warncomm[TASK_COMM_LEN];
318	if (strcmp(warncomm, current->comm) && warned < 5) {
319		strcpy(warncomm,  current->comm);
320		printk(KERN_WARNING "process `%s' is using obsolete "
321		       "%s SO_BSDCOMPAT\n", warncomm, name);
322		warned++;
323	}
324}
325
326#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
327
328static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
329{
330	if (sk->sk_flags & flags) {
331		sk->sk_flags &= ~flags;
332		if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP))
333			net_disable_timestamp();
334	}
335}
336
337
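/*
 * Queue an skb on sk->sk_receive_queue after charging it to the socket's
 * receive buffer and running the socket filter.  Returns -ENOMEM if the
 * receive buffer is already full, -ENOBUFS if rmem scheduling fails and,
 * unless the socket is dead, wakes the reader via sk_data_ready().
 */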
338int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
339{
340	int err;
341	int skb_len;
342	unsigned long flags;
343	struct sk_buff_head *list = &sk->sk_receive_queue;
344
345	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
346		atomic_inc(&sk->sk_drops);
347		trace_sock_rcvqueue_full(sk, skb);
348		return -ENOMEM;
349	}
350
351	err = sk_filter(sk, skb);
352	if (err)
353		return err;
354
355	if (!sk_rmem_schedule(sk, skb->truesize)) {
356		atomic_inc(&sk->sk_drops);
357		return -ENOBUFS;
358	}
359
360	skb->dev = NULL;
361	skb_set_owner_r(skb, sk);
362
363	/* Cache the SKB length before we tack it onto the receive
364	 * queue.  Once it is added it no longer belongs to us and
365	 * may be freed by other threads of control pulling packets
366	 * from the queue.
367	 */
368	skb_len = skb->len;
369
 370	/* we escape from the RCU-protected region, make sure we don't leak
 371	 * a non-refcounted dst
 372	 */
373	skb_dst_force(skb);
374
375	spin_lock_irqsave(&list->lock, flags);
376	skb->dropcount = atomic_read(&sk->sk_drops);
377	__skb_queue_tail(list, skb);
378	spin_unlock_irqrestore(&list->lock, flags);
379
380	if (!sock_flag(sk, SOCK_DEAD))
381		sk->sk_data_ready(sk, skb_len);
382	return 0;
383}
384EXPORT_SYMBOL(sock_queue_rcv_skb);
385
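/*
 * Deliver an skb to a socket from a protocol receive path: run the socket
 * filter, then either process the skb immediately via sk_backlog_rcv() if
 * the socket lock is free, or append it to the backlog.  Drops (and counts)
 * the skb if the filter rejects it or the queues are full.  Releases the
 * reference the caller holds on sk.
 */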
386int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
387{
388	int rc = NET_RX_SUCCESS;
389
390	if (sk_filter(sk, skb))
391		goto discard_and_relse;
392
393	skb->dev = NULL;
394
395	if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) {
396		atomic_inc(&sk->sk_drops);
397		goto discard_and_relse;
398	}
399	if (nested)
400		bh_lock_sock_nested(sk);
401	else
402		bh_lock_sock(sk);
403	if (!sock_owned_by_user(sk)) {
404		/*
405		 * trylock + unlock semantics:
406		 */
407		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
408
409		rc = sk_backlog_rcv(sk, skb);
410
411		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
412	} else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
413		bh_unlock_sock(sk);
414		atomic_inc(&sk->sk_drops);
415		goto discard_and_relse;
416	}
417
418	bh_unlock_sock(sk);
419out:
420	sock_put(sk);
421	return rc;
422discard_and_relse:
423	kfree_skb(skb);
424	goto out;
425}
426EXPORT_SYMBOL(sk_receive_skb);
427
428void sk_reset_txq(struct sock *sk)
429{
430	sk_tx_queue_clear(sk);
431}
432EXPORT_SYMBOL(sk_reset_txq);
433
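/*
 * __sk_dst_check()/sk_dst_check(): return the socket's cached destination
 * entry if it is still valid for @cookie; otherwise drop the stale entry,
 * clear the cached route and return NULL.  The __ variant operates on the
 * lock-protected cache pointer, while sk_dst_check() takes its own
 * reference via sk_dst_get().
 */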
434struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
435{
436	struct dst_entry *dst = __sk_dst_get(sk);
437
438	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
439		sk_tx_queue_clear(sk);
440		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
441		dst_release(dst);
442		return NULL;
443	}
444
445	return dst;
446}
447EXPORT_SYMBOL(__sk_dst_check);
448
449struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
450{
451	struct dst_entry *dst = sk_dst_get(sk);
452
453	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
454		sk_dst_reset(sk);
455		dst_release(dst);
456		return NULL;
457	}
458
459	return dst;
460}
461EXPORT_SYMBOL(sk_dst_check);
462
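/*
 * Handle SO_BINDTODEVICE: resolve the interface name passed from userspace
 * to an ifindex and store it in sk->sk_bound_dev_if (0 unbinds), flushing
 * the cached route.  Requires CAP_NET_RAW.
 */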
463static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
464{
465	int ret = -ENOPROTOOPT;
466#ifdef CONFIG_NETDEVICES
467	struct net *net = sock_net(sk);
468	char devname[IFNAMSIZ];
469	int index;
470
471	/* Sorry... */
472	ret = -EPERM;
473	if (!capable(CAP_NET_RAW))
474		goto out;
475
476	ret = -EINVAL;
477	if (optlen < 0)
478		goto out;
479
480	/* Bind this socket to a particular device like "eth0",
481	 * as specified in the passed interface name. If the
482	 * name is "" or the option length is zero the socket
483	 * is not bound.
484	 */
485	if (optlen > IFNAMSIZ - 1)
486		optlen = IFNAMSIZ - 1;
487	memset(devname, 0, sizeof(devname));
488
489	ret = -EFAULT;
490	if (copy_from_user(devname, optval, optlen))
491		goto out;
492
493	index = 0;
494	if (devname[0] != '\0') {
495		struct net_device *dev;
496
497		rcu_read_lock();
498		dev = dev_get_by_name_rcu(net, devname);
499		if (dev)
500			index = dev->ifindex;
501		rcu_read_unlock();
502		ret = -ENODEV;
503		if (!dev)
504			goto out;
505	}
506
507	lock_sock(sk);
508	sk->sk_bound_dev_if = index;
509	sk_dst_reset(sk);
510	release_sock(sk);
511
512	ret = 0;
513
514out:
515#endif
516
517	return ret;
518}
519
520static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
521{
522	if (valbool)
523		sock_set_flag(sk, bit);
524	else
525		sock_reset_flag(sk, bit);
526}
527
528/*
529 *	This is meant for all protocols to use and covers goings on
530 *	at the socket level. Everything here is generic.
531 */
532
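/*
 * Illustrative userspace sketch (not part of this file): the SO_RCVBUF /
 * SO_SNDBUF cases below store twice the requested value to cover struct
 * sk_buff overhead, so reading the option back returns the doubled figure.
 *
 *	int val = 65536;
 *	socklen_t len = sizeof(val);
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
 *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, &len);
 *	// val is typically 131072 now (2 * 65536, assuming 65536 <= rmem_max)
 */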
533int sock_setsockopt(struct socket *sock, int level, int optname,
534		    char __user *optval, unsigned int optlen)
535{
536	struct sock *sk = sock->sk;
537	int val;
538	int valbool;
539	struct linger ling;
540	int ret = 0;
541
542	/*
543	 *	Options without arguments
544	 */
545
546	if (optname == SO_BINDTODEVICE)
547		return sock_bindtodevice(sk, optval, optlen);
548
549	if (optlen < sizeof(int))
550		return -EINVAL;
551
552	if (get_user(val, (int __user *)optval))
553		return -EFAULT;
554
555	valbool = val ? 1 : 0;
556
557	lock_sock(sk);
558
559	switch (optname) {
560	case SO_DEBUG:
561		if (val && !capable(CAP_NET_ADMIN))
562			ret = -EACCES;
563		else
564			sock_valbool_flag(sk, SOCK_DBG, valbool);
565		break;
566	case SO_REUSEADDR:
567		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
568		break;
569	case SO_TYPE:
570	case SO_PROTOCOL:
571	case SO_DOMAIN:
572	case SO_ERROR:
573		ret = -ENOPROTOOPT;
574		break;
575	case SO_DONTROUTE:
576		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
577		break;
578	case SO_BROADCAST:
579		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
580		break;
581	case SO_SNDBUF:
 582		/* Don't error on this - BSD doesn't, and if you think
583		 * about it this is right. Otherwise apps have to
584		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
585		 * are treated in BSD as hints
586		 */
587		val = min_t(u32, val, sysctl_wmem_max);
588set_sndbuf:
589		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
590		sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF);
591		/* Wake up sending tasks if we upped the value. */
592		sk->sk_write_space(sk);
593		break;
594
595	case SO_SNDBUFFORCE:
596		if (!capable(CAP_NET_ADMIN)) {
597			ret = -EPERM;
598			break;
599		}
600		goto set_sndbuf;
601
602	case SO_RCVBUF:
 603		/* Don't error on this - BSD doesn't, and if you think
604		 * about it this is right. Otherwise apps have to
605		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
606		 * are treated in BSD as hints
607		 */
608		val = min_t(u32, val, sysctl_rmem_max);
609set_rcvbuf:
610		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
611		/*
612		 * We double it on the way in to account for
613		 * "struct sk_buff" etc. overhead.   Applications
614		 * assume that the SO_RCVBUF setting they make will
615		 * allow that much actual data to be received on that
616		 * socket.
617		 *
618		 * Applications are unaware that "struct sk_buff" and
619		 * other overheads allocate from the receive buffer
620		 * during socket buffer allocation.
621		 *
622		 * And after considering the possible alternatives,
623		 * returning the value we actually used in getsockopt
624		 * is the most desirable behavior.
625		 */
626		sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF);
627		break;
628
629	case SO_RCVBUFFORCE:
630		if (!capable(CAP_NET_ADMIN)) {
631			ret = -EPERM;
632			break;
633		}
634		goto set_rcvbuf;
635
636	case SO_KEEPALIVE:
637#ifdef CONFIG_INET
638		if (sk->sk_protocol == IPPROTO_TCP)
639			tcp_set_keepalive(sk, valbool);
640#endif
641		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
642		break;
643
644	case SO_OOBINLINE:
645		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
646		break;
647
648	case SO_NO_CHECK:
649		sk->sk_no_check = valbool;
650		break;
651
652	case SO_PRIORITY:
653		if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
654			sk->sk_priority = val;
655		else
656			ret = -EPERM;
657		break;
658
659	case SO_LINGER:
660		if (optlen < sizeof(ling)) {
661			ret = -EINVAL;	/* 1003.1g */
662			break;
663		}
664		if (copy_from_user(&ling, optval, sizeof(ling))) {
665			ret = -EFAULT;
666			break;
667		}
668		if (!ling.l_onoff)
669			sock_reset_flag(sk, SOCK_LINGER);
670		else {
671#if (BITS_PER_LONG == 32)
672			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
673				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
674			else
675#endif
676				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
677			sock_set_flag(sk, SOCK_LINGER);
678		}
679		break;
680
681	case SO_BSDCOMPAT:
682		sock_warn_obsolete_bsdism("setsockopt");
683		break;
684
685	case SO_PASSCRED:
686		if (valbool)
687			set_bit(SOCK_PASSCRED, &sock->flags);
688		else
689			clear_bit(SOCK_PASSCRED, &sock->flags);
690		break;
691
692	case SO_TIMESTAMP:
693	case SO_TIMESTAMPNS:
694		if (valbool)  {
695			if (optname == SO_TIMESTAMP)
696				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
697			else
698				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
699			sock_set_flag(sk, SOCK_RCVTSTAMP);
700			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
701		} else {
702			sock_reset_flag(sk, SOCK_RCVTSTAMP);
703			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
704		}
705		break;
706
707	case SO_TIMESTAMPING:
708		if (val & ~SOF_TIMESTAMPING_MASK) {
709			ret = -EINVAL;
710			break;
711		}
712		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
713				  val & SOF_TIMESTAMPING_TX_HARDWARE);
714		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
715				  val & SOF_TIMESTAMPING_TX_SOFTWARE);
716		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
717				  val & SOF_TIMESTAMPING_RX_HARDWARE);
718		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
719			sock_enable_timestamp(sk,
720					      SOCK_TIMESTAMPING_RX_SOFTWARE);
721		else
722			sock_disable_timestamp(sk,
723					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
724		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
725				  val & SOF_TIMESTAMPING_SOFTWARE);
726		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
727				  val & SOF_TIMESTAMPING_SYS_HARDWARE);
728		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
729				  val & SOF_TIMESTAMPING_RAW_HARDWARE);
730		break;
731
732	case SO_RCVLOWAT:
733		if (val < 0)
734			val = INT_MAX;
735		sk->sk_rcvlowat = val ? : 1;
736		break;
737
738	case SO_RCVTIMEO:
739		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
740		break;
741
742	case SO_SNDTIMEO:
743		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
744		break;
745
746	case SO_ATTACH_FILTER:
747		ret = -EINVAL;
748		if (optlen == sizeof(struct sock_fprog)) {
749			struct sock_fprog fprog;
750
751			ret = -EFAULT;
752			if (copy_from_user(&fprog, optval, sizeof(fprog)))
753				break;
754
755			ret = sk_attach_filter(&fprog, sk);
756		}
757		break;
758
759	case SO_DETACH_FILTER:
760		ret = sk_detach_filter(sk);
761		break;
762
763	case SO_PASSSEC:
764		if (valbool)
765			set_bit(SOCK_PASSSEC, &sock->flags);
766		else
767			clear_bit(SOCK_PASSSEC, &sock->flags);
768		break;
769	case SO_MARK:
770		if (!capable(CAP_NET_ADMIN))
771			ret = -EPERM;
772		else
773			sk->sk_mark = val;
774		break;
775
 776		/* We implement SO_SNDLOWAT etc. as not settable
 777		 * (1003.1g 5.3); such options fall through to the default case */
778	case SO_RXQ_OVFL:
779		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
780		break;
781
782	case SO_WIFI_STATUS:
783		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
784		break;
785
786	case SO_PEEK_OFF:
787		if (sock->ops->set_peek_off)
788			sock->ops->set_peek_off(sk, val);
789		else
790			ret = -EOPNOTSUPP;
791		break;
792
793	case SO_NOFCS:
794		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
795		break;
796
797	default:
798		ret = -ENOPROTOOPT;
799		break;
800	}
801	release_sock(sk);
802	return ret;
803}
804EXPORT_SYMBOL(sock_setsockopt);
805
806
807void cred_to_ucred(struct pid *pid, const struct cred *cred,
808		   struct ucred *ucred)
809{
810	ucred->pid = pid_vnr(pid);
811	ucred->uid = ucred->gid = -1;
812	if (cred) {
813		struct user_namespace *current_ns = current_user_ns();
814
815		ucred->uid = user_ns_map_uid(current_ns, cred, cred->euid);
816		ucred->gid = user_ns_map_gid(current_ns, cred, cred->egid);
817	}
818}
819EXPORT_SYMBOL_GPL(cred_to_ucred);
820
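/*
 * Generic SOL_SOCKET getsockopt(): the counterpart of sock_setsockopt()
 * above.  Option values are assembled in a local union and truncated to
 * the user-supplied length, which is written back through @optlen.
 */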
821int sock_getsockopt(struct socket *sock, int level, int optname,
822		    char __user *optval, int __user *optlen)
823{
824	struct sock *sk = sock->sk;
825
826	union {
827		int val;
828		struct linger ling;
829		struct timeval tm;
830	} v;
831
832	int lv = sizeof(int);
833	int len;
834
835	if (get_user(len, optlen))
836		return -EFAULT;
837	if (len < 0)
838		return -EINVAL;
839
840	memset(&v, 0, sizeof(v));
841
842	switch (optname) {
843	case SO_DEBUG:
844		v.val = sock_flag(sk, SOCK_DBG);
845		break;
846
847	case SO_DONTROUTE:
848		v.val = sock_flag(sk, SOCK_LOCALROUTE);
849		break;
850
851	case SO_BROADCAST:
852		v.val = !!sock_flag(sk, SOCK_BROADCAST);
853		break;
854
855	case SO_SNDBUF:
856		v.val = sk->sk_sndbuf;
857		break;
858
859	case SO_RCVBUF:
860		v.val = sk->sk_rcvbuf;
861		break;
862
863	case SO_REUSEADDR:
864		v.val = sk->sk_reuse;
865		break;
866
867	case SO_KEEPALIVE:
868		v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
869		break;
870
871	case SO_TYPE:
872		v.val = sk->sk_type;
873		break;
874
875	case SO_PROTOCOL:
876		v.val = sk->sk_protocol;
877		break;
878
879	case SO_DOMAIN:
880		v.val = sk->sk_family;
881		break;
882
883	case SO_ERROR:
884		v.val = -sock_error(sk);
885		if (v.val == 0)
886			v.val = xchg(&sk->sk_err_soft, 0);
887		break;
888
889	case SO_OOBINLINE:
890		v.val = !!sock_flag(sk, SOCK_URGINLINE);
891		break;
892
893	case SO_NO_CHECK:
894		v.val = sk->sk_no_check;
895		break;
896
897	case SO_PRIORITY:
898		v.val = sk->sk_priority;
899		break;
900
901	case SO_LINGER:
902		lv		= sizeof(v.ling);
903		v.ling.l_onoff	= !!sock_flag(sk, SOCK_LINGER);
904		v.ling.l_linger	= sk->sk_lingertime / HZ;
905		break;
906
907	case SO_BSDCOMPAT:
908		sock_warn_obsolete_bsdism("getsockopt");
909		break;
910
911	case SO_TIMESTAMP:
912		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
913				!sock_flag(sk, SOCK_RCVTSTAMPNS);
914		break;
915
916	case SO_TIMESTAMPNS:
917		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
918		break;
919
920	case SO_TIMESTAMPING:
921		v.val = 0;
922		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
923			v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
924		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
925			v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
926		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
927			v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
928		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
929			v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
930		if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
931			v.val |= SOF_TIMESTAMPING_SOFTWARE;
932		if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
933			v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
934		if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
935			v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
936		break;
937
938	case SO_RCVTIMEO:
939		lv = sizeof(struct timeval);
940		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
941			v.tm.tv_sec = 0;
942			v.tm.tv_usec = 0;
943		} else {
944			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
945			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
946		}
947		break;
948
949	case SO_SNDTIMEO:
950		lv = sizeof(struct timeval);
951		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
952			v.tm.tv_sec = 0;
953			v.tm.tv_usec = 0;
954		} else {
955			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
956			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
957		}
958		break;
959
960	case SO_RCVLOWAT:
961		v.val = sk->sk_rcvlowat;
962		break;
963
964	case SO_SNDLOWAT:
965		v.val = 1;
966		break;
967
968	case SO_PASSCRED:
969		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
970		break;
971
972	case SO_PEERCRED:
973	{
974		struct ucred peercred;
975		if (len > sizeof(peercred))
976			len = sizeof(peercred);
977		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
978		if (copy_to_user(optval, &peercred, len))
979			return -EFAULT;
980		goto lenout;
981	}
982
983	case SO_PEERNAME:
984	{
985		char address[128];
986
987		if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
988			return -ENOTCONN;
989		if (lv < len)
990			return -EINVAL;
991		if (copy_to_user(optval, address, len))
992			return -EFAULT;
993		goto lenout;
994	}
995
996	/* Dubious BSD thing... Probably nobody even uses it, but
997	 * the UNIX standard wants it for whatever reason... -DaveM
998	 */
999	case SO_ACCEPTCONN:
1000		v.val = sk->sk_state == TCP_LISTEN;
1001		break;
1002
1003	case SO_PASSSEC:
1004		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1005		break;
1006
1007	case SO_PEERSEC:
1008		return security_socket_getpeersec_stream(sock, optval, optlen, len);
1009
1010	case SO_MARK:
1011		v.val = sk->sk_mark;
1012		break;
1013
1014	case SO_RXQ_OVFL:
1015		v.val = !!sock_flag(sk, SOCK_RXQ_OVFL);
1016		break;
1017
1018	case SO_WIFI_STATUS:
1019		v.val = !!sock_flag(sk, SOCK_WIFI_STATUS);
1020		break;
1021
1022	case SO_PEEK_OFF:
1023		if (!sock->ops->set_peek_off)
1024			return -EOPNOTSUPP;
1025
1026		v.val = sk->sk_peek_off;
1027		break;
1028	case SO_NOFCS:
1029		v.val = !!sock_flag(sk, SOCK_NOFCS);
1030		break;
1031	default:
1032		return -ENOPROTOOPT;
1033	}
1034
1035	if (len > lv)
1036		len = lv;
1037	if (copy_to_user(optval, &v, len))
1038		return -EFAULT;
1039lenout:
1040	if (put_user(len, optlen))
1041		return -EFAULT;
1042	return 0;
1043}
1044
1045/*
1046 * Initialize an sk_lock.
1047 *
1048 * (We also register the sk_lock with the lock validator.)
1049 */
1050static inline void sock_lock_init(struct sock *sk)
1051{
1052	sock_lock_init_class_and_name(sk,
1053			af_family_slock_key_strings[sk->sk_family],
1054			af_family_slock_keys + sk->sk_family,
1055			af_family_key_strings[sk->sk_family],
1056			af_family_keys + sk->sk_family);
1057}
1058
1059/*
1060 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1061 * even temporarly, because of RCU lookups. sk_node should also be left as is.
1062 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1063 */
1064static void sock_copy(struct sock *nsk, const struct sock *osk)
1065{
1066#ifdef CONFIG_SECURITY_NETWORK
1067	void *sptr = nsk->sk_security;
1068#endif
1069	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1070
1071	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1072	       osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1073
1074#ifdef CONFIG_SECURITY_NETWORK
1075	nsk->sk_security = sptr;
1076	security_sk_clone(osk, nsk);
1077#endif
1078}
1079
1080/*
 1081 * Caches using SLAB_DESTROY_BY_RCU should leave the .next pointer of nulls
 1082 * nodes unmodified. Special care is taken when initializing the object to zero.
1083 */
1084static inline void sk_prot_clear_nulls(struct sock *sk, int size)
1085{
1086	if (offsetof(struct sock, sk_node.next) != 0)
1087		memset(sk, 0, offsetof(struct sock, sk_node.next));
1088	memset(&sk->sk_node.pprev, 0,
1089	       size - offsetof(struct sock, sk_node.pprev));
1090}
1091
1092void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
1093{
1094	unsigned long nulls1, nulls2;
1095
1096	nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
1097	nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
1098	if (nulls1 > nulls2)
1099		swap(nulls1, nulls2);
1100
1101	if (nulls1 != 0)
1102		memset((char *)sk, 0, nulls1);
1103	memset((char *)sk + nulls1 + sizeof(void *), 0,
1104	       nulls2 - nulls1 - sizeof(void *));
1105	memset((char *)sk + nulls2 + sizeof(void *), 0,
1106	       size - nulls2 - sizeof(void *));
1107}
1108EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
1109
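/*
 * Allocate the protocol-private struct sock, either from the protocol's
 * dedicated slab cache or with kmalloc(), then attach the LSM security
 * blob and pin the owning module.  Undoes both on failure.
 */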
1110static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1111		int family)
1112{
1113	struct sock *sk;
1114	struct kmem_cache *slab;
1115
1116	slab = prot->slab;
1117	if (slab != NULL) {
1118		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1119		if (!sk)
1120			return sk;
1121		if (priority & __GFP_ZERO) {
1122			if (prot->clear_sk)
1123				prot->clear_sk(sk, prot->obj_size);
1124			else
1125				sk_prot_clear_nulls(sk, prot->obj_size);
1126		}
1127	} else
1128		sk = kmalloc(prot->obj_size, priority);
1129
1130	if (sk != NULL) {
1131		kmemcheck_annotate_bitfield(sk, flags);
1132
1133		if (security_sk_alloc(sk, family, priority))
1134			goto out_free;
1135
1136		if (!try_module_get(prot->owner))
1137			goto out_free_sec;
1138		sk_tx_queue_clear(sk);
1139	}
1140
1141	return sk;
1142
1143out_free_sec:
1144	security_sk_free(sk);
1145out_free:
1146	if (slab != NULL)
1147		kmem_cache_free(slab, sk);
1148	else
1149		kfree(sk);
1150	return NULL;
1151}
1152
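/*
 * Reverse of sk_prot_alloc(): release the security blob, return the memory
 * to the slab cache (or kfree it) and drop the module reference.
 */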
1153static void sk_prot_free(struct proto *prot, struct sock *sk)
1154{
1155	struct kmem_cache *slab;
1156	struct module *owner;
1157
1158	owner = prot->owner;
1159	slab = prot->slab;
1160
1161	security_sk_free(sk);
1162	if (slab != NULL)
1163		kmem_cache_free(slab, sk);
1164	else
1165		kfree(sk);
1166	module_put(owner);
1167}
1168
1169#ifdef CONFIG_CGROUPS
1170void sock_update_classid(struct sock *sk)
1171{
1172	u32 classid;
1173
1174	rcu_read_lock();  /* doing current task, which cannot vanish. */
1175	classid = task_cls_classid(current);
1176	rcu_read_unlock();
1177	if (classid && classid != sk->sk_classid)
1178		sk->sk_classid = classid;
1179}
1180EXPORT_SYMBOL(sock_update_classid);
1181
1182void sock_update_netprioidx(struct sock *sk)
1183{
1184	if (in_interrupt())
1185		return;
1186
1187	sk->sk_cgrp_prioidx = task_netprioidx(current);
1188}
1189EXPORT_SYMBOL_GPL(sock_update_netprioidx);
1190#endif
1191
1192/**
1193 *	sk_alloc - All socket objects are allocated here
1194 *	@net: the applicable net namespace
1195 *	@family: protocol family
1196 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1197 *	@prot: struct proto associated with this new sock instance
1198 */
1199struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1200		      struct proto *prot)
1201{
1202	struct sock *sk;
1203
1204	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1205	if (sk) {
1206		sk->sk_family = family;
1207		/*
1208		 * See comment in struct sock definition to understand
1209		 * why we need sk_prot_creator -acme
1210		 */
1211		sk->sk_prot = sk->sk_prot_creator = prot;
1212		sock_lock_init(sk);
1213		sock_net_set(sk, get_net(net));
1214		atomic_set(&sk->sk_wmem_alloc, 1);
1215
1216		sock_update_classid(sk);
1217		sock_update_netprioidx(sk);
1218	}
1219
1220	return sk;
1221}
1222EXPORT_SYMBOL(sk_alloc);
1223
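/*
 * Actually tear down a socket once the last sk_wmem_alloc reference is
 * gone: run the destructor, detach the filter, disable timestamping, and
 * drop the credential, pid and netns references before freeing the
 * struct sock itself.
 */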
1224static void __sk_free(struct sock *sk)
1225{
1226	struct sk_filter *filter;
1227
1228	if (sk->sk_destruct)
1229		sk->sk_destruct(sk);
1230
1231	filter = rcu_dereference_check(sk->sk_filter,
1232				       atomic_read(&sk->sk_wmem_alloc) == 0);
1233	if (filter) {
1234		sk_filter_uncharge(sk, filter);
1235		RCU_INIT_POINTER(sk->sk_filter, NULL);
1236	}
1237
1238	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1239
1240	if (atomic_read(&sk->sk_omem_alloc))
1241		printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
1242		       __func__, atomic_read(&sk->sk_omem_alloc));
1243
1244	if (sk->sk_peer_cred)
1245		put_cred(sk->sk_peer_cred);
1246	put_pid(sk->sk_peer_pid);
1247	put_net(sock_net(sk));
1248	sk_prot_free(sk->sk_prot_creator, sk);
1249}
1250
1251void sk_free(struct sock *sk)
1252{
1253	/*
 1254	 * We subtract one from sk_wmem_alloc so we can tell whether
 1255	 * some packets are still in some tx queue.
 1256	 * If it is not zero, sock_wfree() will call __sk_free(sk) later.
1257	 */
1258	if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1259		__sk_free(sk);
1260}
1261EXPORT_SYMBOL(sk_free);
1262
1263/*
 1264 * The last sock_put should drop the reference to sk->sk_net. It has already
 1265 * been dropped in sk_change_net. Taking a reference to the stopping namespace
 1266 * is not an option.
 1267 * Take a reference to the socket to remove it from the hash while still
 1268 * _alive_, and after that destroy it in the context of init_net.
1269 */
1270void sk_release_kernel(struct sock *sk)
1271{
1272	if (sk == NULL || sk->sk_socket == NULL)
1273		return;
1274
1275	sock_hold(sk);
1276	sock_release(sk->sk_socket);
1277	release_net(sock_net(sk));
1278	sock_net_set(sk, get_net(&init_net));
1279	sock_put(sk);
1280}
1281EXPORT_SYMBOL(sk_release_kernel);
1282
1283static void sk_update_clone(const struct sock *sk, struct sock *newsk)
1284{
1285	if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
1286		sock_update_memcg(newsk);
1287}
1288
1289/**
1290 *	sk_clone_lock - clone a socket, and lock its clone
1291 *	@sk: the socket to clone
1292 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1293 *
1294 *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1295 */
1296struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1297{
1298	struct sock *newsk;
1299
1300	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1301	if (newsk != NULL) {
1302		struct sk_filter *filter;
1303
1304		sock_copy(newsk, sk);
1305
1306		/* SANITY */
1307		get_net(sock_net(newsk));
1308		sk_node_init(&newsk->sk_node);
1309		sock_lock_init(newsk);
1310		bh_lock_sock(newsk);
1311		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
1312		newsk->sk_backlog.len = 0;
1313
1314		atomic_set(&newsk->sk_rmem_alloc, 0);
1315		/*
1316		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1317		 */
1318		atomic_set(&newsk->sk_wmem_alloc, 1);
1319		atomic_set(&newsk->sk_omem_alloc, 0);
1320		skb_queue_head_init(&newsk->sk_receive_queue);
1321		skb_queue_head_init(&newsk->sk_write_queue);
1322#ifdef CONFIG_NET_DMA
1323		skb_queue_head_init(&newsk->sk_async_wait_queue);
1324#endif
1325
1326		spin_lock_init(&newsk->sk_dst_lock);
1327		rwlock_init(&newsk->sk_callback_lock);
1328		lockdep_set_class_and_name(&newsk->sk_callback_lock,
1329				af_callback_keys + newsk->sk_family,
1330				af_family_clock_key_strings[newsk->sk_family]);
1331
1332		newsk->sk_dst_cache	= NULL;
1333		newsk->sk_wmem_queued	= 0;
1334		newsk->sk_forward_alloc = 0;
1335		newsk->sk_send_head	= NULL;
1336		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1337
1338		sock_reset_flag(newsk, SOCK_DONE);
1339		skb_queue_head_init(&newsk->sk_error_queue);
1340
1341		filter = rcu_dereference_protected(newsk->sk_filter, 1);
1342		if (filter != NULL)
1343			sk_filter_charge(newsk, filter);
1344
1345		if (unlikely(xfrm_sk_clone_policy(newsk))) {
 1346			/* It is still a raw copy of the parent, so invalidate
 1347			 * the destructor and do a plain sk_free() */
1348			newsk->sk_destruct = NULL;
1349			bh_unlock_sock(newsk);
1350			sk_free(newsk);
1351			newsk = NULL;
1352			goto out;
1353		}
1354
1355		newsk->sk_err	   = 0;
1356		newsk->sk_priority = 0;
1357		/*
1358		 * Before updating sk_refcnt, we must commit prior changes to memory
1359		 * (Documentation/RCU/rculist_nulls.txt for details)
1360		 */
1361		smp_wmb();
1362		atomic_set(&newsk->sk_refcnt, 2);
1363
1364		/*
1365		 * Increment the counter in the same struct proto as the master
1366		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1367		 * is the same as sk->sk_prot->socks, as this field was copied
1368		 * with memcpy).
1369		 *
1370		 * This _changes_ the previous behaviour, where
 1371		 * tcp_create_openreq_child was always incrementing the
 1372		 * equivalent of tcp_prot->socks (inet_sock_nr), so this has
1373		 * to be taken into account in all callers. -acme
1374		 */
1375		sk_refcnt_debug_inc(newsk);
1376		sk_set_socket(newsk, NULL);
1377		newsk->sk_wq = NULL;
1378
1379		sk_update_clone(sk, newsk);
1380
1381		if (newsk->sk_prot->sockets_allocated)
1382			sk_sockets_allocated_inc(newsk);
1383
1384		if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1385			net_enable_timestamp();
1386	}
1387out:
1388	return newsk;
1389}
1390EXPORT_SYMBOL_GPL(sk_clone_lock);
1391
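/*
 * Attach @dst to the socket and derive the route capabilities (GSO,
 * scatter-gather, checksum offload, gso_max_size) from the output
 * device, masking off anything listed in sk->sk_route_nocaps.
 */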
1392void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1393{
1394	__sk_dst_set(sk, dst);
1395	sk->sk_route_caps = dst->dev->features;
1396	if (sk->sk_route_caps & NETIF_F_GSO)
1397		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1398	sk->sk_route_caps &= ~sk->sk_route_nocaps;
1399	if (sk_can_gso(sk)) {
1400		if (dst->header_len) {
1401			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1402		} else {
1403			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1404			sk->sk_gso_max_size = dst->dev->gso_max_size;
1405		}
1406	}
1407}
1408EXPORT_SYMBOL_GPL(sk_setup_caps);
1409
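/*
 * Boot-time tuning of the default socket buffer sysctls based on total
 * RAM: very small machines (<= 4096 pages) get 32767-byte limits, while
 * machines with at least 131072 pages get 131071-byte maximums.
 */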
1410void __init sk_init(void)
1411{
1412	if (totalram_pages <= 4096) {
1413		sysctl_wmem_max = 32767;
1414		sysctl_rmem_max = 32767;
1415		sysctl_wmem_default = 32767;
1416		sysctl_rmem_default = 32767;
1417	} else if (totalram_pages >= 131072) {
1418		sysctl_wmem_max = 131071;
1419		sysctl_rmem_max = 131071;
1420	}
1421}
1422
1423/*
1424 *	Simple resource managers for sockets.
1425 */
1426
1427
1428/*
1429 * Write buffer destructor automatically called from kfree_skb.
1430 */
1431void sock_wfree(struct sk_buff *skb)
1432{
1433	struct sock *sk = skb->sk;
1434	unsigned int len = skb->truesize;
1435
1436	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1437		/*
 1438		 * Keep a reference on sk_wmem_alloc; it will be released
 1439		 * after the sk_write_space() call
1440		 */
1441		atomic_sub(len - 1, &sk->sk_wmem_alloc);
1442		sk->sk_write_space(sk);
1443		len = 1;
1444	}
1445	/*
1446	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1447	 * could not do because of in-flight packets
1448	 */
1449	if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1450		__sk_free(sk);
1451}
1452EXPORT_SYMBOL(sock_wfree);
1453
1454/*
1455 * Read buffer destructor automatically called from kfree_skb.
1456 */
1457void sock_rfree(struct sk_buff *skb)
1458{
1459	struct sock *sk = skb->sk;
1460	unsigned int len = skb->truesize;
1461
1462	atomic_sub(len, &sk->sk_rmem_alloc);
1463	sk_mem_uncharge(sk, len);
1464}
1465EXPORT_SYMBOL(sock_rfree);
1466
1467
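/*
 * sock_i_uid()/sock_i_ino(): report the owning inode's uid / inode number
 * for a socket, or 0 if it has no attached struct socket, under
 * sk_callback_lock so the socket cannot be detached underneath us.
 */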
1468int sock_i_uid(struct sock *sk)
1469{
1470	int uid;
1471
1472	read_lock_bh(&sk->sk_callback_lock);
1473	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
1474	read_unlock_bh(&sk->sk_callback_lock);
1475	return uid;
1476}
1477EXPORT_SYMBOL(sock_i_uid);
1478
1479unsigned long sock_i_ino(struct sock *sk)
1480{
1481	unsigned long ino;
1482
1483	read_lock_bh(&sk->sk_callback_lock);
1484	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1485	read_unlock_bh(&sk->sk_callback_lock);
1486	return ino;
1487}
1488EXPORT_SYMBOL(sock_i_ino);
1489
1490/*
1491 * Allocate a skb from the socket's send buffer.
1492 */
1493struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1494			     gfp_t priority)
1495{
1496	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1497		struct sk_buff *skb = alloc_skb(size, priority);
1498		if (skb) {
1499			skb_set_owner_w(skb, sk);
1500			return skb;
1501		}
1502	}
1503	return NULL;
1504}
1505EXPORT_SYMBOL(sock_wmalloc);
1506
1507/*
1508 * Allocate a skb from the socket's receive buffer.
1509 */
1510struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
1511			     gfp_t priority)
1512{
1513	if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1514		struct sk_buff *skb = alloc_skb(size, priority);
1515		if (skb) {
1516			skb_set_owner_r(skb, sk);
1517			return skb;
1518		}
1519	}
1520	return NULL;
1521}
1522
1523/*
1524 * Allocate a memory block from the socket's option memory buffer.
1525 */
1526void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1527{
1528	if ((unsigned int)size <= sysctl_optmem_max &&
1529	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1530		void *mem;
1531		/* First do the add, to avoid the race if kmalloc
1532		 * might sleep.
1533		 */
1534		atomic_add(size, &sk->sk_omem_alloc);
1535		mem = kmalloc(size, priority);
1536		if (mem)
1537			return mem;
1538		atomic_sub(size, &sk->sk_omem_alloc);
1539	}
1540	return NULL;
1541}
1542EXPORT_SYMBOL(sock_kmalloc);
1543
1544/*
1545 * Free an option memory block.
1546 */
1547void sock_kfree_s(struct sock *sk, void *mem, int size)
1548{
1549	kfree(mem);
1550	atomic_sub(size, &sk->sk_omem_alloc);
1551}
1552EXPORT_SYMBOL(sock_kfree_s);
1553
1554/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
 1555   I think these locks should be removed for datagram sockets.
1556 */
1557static long sock_wait_for_wmem(struct sock *sk, long timeo)
1558{
1559	DEFINE_WAIT(wait);
1560
1561	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1562	for (;;) {
1563		if (!timeo)
1564			break;
1565		if (signal_pending(current))
1566			break;
1567		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1568		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1569		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1570			break;
1571		if (sk->sk_shutdown & SEND_SHUTDOWN)
1572			break;
1573		if (sk->sk_err)
1574			break;
1575		timeo = schedule_timeout(timeo);
1576	}
1577	finish_wait(sk_sleep(sk), &wait);
1578	return timeo;
1579}
1580
1581
1582/*
1583 *	Generic send/receive buffer handlers
1584 */
1585
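/*
 * Allocate an skb with @header_len bytes of linear space and @data_len
 * bytes spread over page fragments, charged to the socket's send buffer.
 * Unless @noblock is set, blocks (subject to the socket's send timeout)
 * until enough write space is available.
 */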
1586struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1587				     unsigned long data_len, int noblock,
1588				     int *errcode)
1589{
1590	struct sk_buff *skb;
1591	gfp_t gfp_mask;
1592	long timeo;
1593	int err;
1594
1595	gfp_mask = sk->sk_allocation;
1596	if (gfp_mask & __GFP_WAIT)
1597		gfp_mask |= __GFP_REPEAT;
1598
1599	timeo = sock_sndtimeo(sk, noblock);
1600	while (1) {
1601		err = sock_error(sk);
1602		if (err != 0)
1603			goto failure;
1604
1605		err = -EPIPE;
1606		if (sk->sk_shutdown & SEND_SHUTDOWN)
1607			goto failure;
1608
1609		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1610			skb = alloc_skb(header_len, gfp_mask);
1611			if (skb) {
1612				int npages;
1613				int i;
1614
1615				/* No pages, we're done... */
1616				if (!data_len)
1617					break;
1618
1619				npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1620				skb->truesize += data_len;
1621				skb_shinfo(skb)->nr_frags = npages;
1622				for (i = 0; i < npages; i++) {
1623					struct page *page;
1624
1625					page = alloc_pages(sk->sk_allocation, 0);
1626					if (!page) {
1627						err = -ENOBUFS;
1628						skb_shinfo(skb)->nr_frags = i;
1629						kfree_skb(skb);
1630						goto failure;
1631					}
1632
1633					__skb_fill_page_desc(skb, i,
1634							page, 0,
1635							(data_len >= PAGE_SIZE ?
1636							 PAGE_SIZE :
1637							 data_len));
1638					data_len -= PAGE_SIZE;
1639				}
1640
1641				/* Full success... */
1642				break;
1643			}
1644			err = -ENOBUFS;
1645			goto failure;
1646		}
1647		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1648		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1649		err = -EAGAIN;
1650		if (!timeo)
1651			goto failure;
1652		if (signal_pending(current))
1653			goto interrupted;
1654		timeo = sock_wait_for_wmem(sk, timeo);
1655	}
1656
1657	skb_set_owner_w(skb, sk);
1658	return skb;
1659
1660interrupted:
1661	err = sock_intr_errno(timeo);
1662failure:
1663	*errcode = err;
1664	return NULL;
1665}
1666EXPORT_SYMBOL(sock_alloc_send_pskb);
1667
1668struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1669				    int noblock, int *errcode)
1670{
1671	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
1672}
1673EXPORT_SYMBOL(sock_alloc_send_skb);
1674
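/*
 * Wait (uninterruptibly) until the socket owner releases the lock; used
 * by lock_sock() when the lock is currently owned by user context.
 */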
1675static void __lock_sock(struct sock *sk)
1676	__releases(&sk->sk_lock.slock)
1677	__acquires(&sk->sk_lock.slock)
1678{
1679	DEFINE_WAIT(wait);
1680
1681	for (;;) {
1682		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1683					TASK_UNINTERRUPTIBLE);
1684		spin_unlock_bh(&sk->sk_lock.slock);
1685		schedule();
1686		spin_lock_bh(&sk->sk_lock.slock);
1687		if (!sock_owned_by_user(sk))
1688			break;
1689	}
1690	finish_wait(&sk->sk_lock.wq, &wait);
1691}
1692
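/*
 * Process every skb that accumulated on the backlog while the socket was
 * owned by user context, calling sk_backlog_rcv() on each with the
 * spinlock dropped; used by release_sock().
 */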
1693static void __release_sock(struct sock *sk)
1694	__releases(&sk->sk_lock.slock)
1695	__acquires(&sk->sk_lock.slock)
1696{
1697	struct sk_buff *skb = sk->sk_backlog.head;
1698
1699	do {
1700		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1701		bh_unlock_sock(sk);
1702
1703		do {
1704			struct sk_buff *next = skb->next;
1705
1706			prefetch(next);
1707			WARN_ON_ONCE(skb_dst_is_noref(skb));
1708			skb->next = NULL;
1709			sk_backlog_rcv(sk, skb);
1710
1711			/*
1712			 * We are in process context here with softirqs
1713			 * disabled, use cond_resched_softirq() to preempt.
1714			 * This is safe to do because we've taken the backlog
1715			 * queue private:
1716			 */
1717			cond_resched_softirq();
1718
1719			skb = next;
1720		} while (skb != NULL);
1721
1722		bh_lock_sock(sk);
1723	} while ((skb = sk->sk_backlog.head) != NULL);
1724
1725	/*
 1726	 * Doing the zeroing here guarantees we cannot loop forever
1727	 * while a wild producer attempts to flood us.
1728	 */
1729	sk->sk_backlog.len = 0;
1730}
1731
1732/**
1733 * sk_wait_data - wait for data to arrive at sk_receive_queue
1734 * @sk:    sock to wait on
1735 * @timeo: for how long
1736 *
1737 * Now socket state including sk->sk_err is changed only under lock,
 1738 * hence we may omit checks after joining the wait queue.
 1739 * We check the receive queue before schedule() only as an optimization;
1740 * it is very likely that release_sock() added new data.
1741 */
1742int sk_wait_data(struct sock *sk, long *timeo)
1743{
1744	int rc;
1745	DEFINE_WAIT(wait);
1746
1747	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1748	set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1749	rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1750	clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1751	finish_wait(sk_sleep(sk), &wait);
1752	return rc;
1753}
1754EXPORT_SYMBOL(sk_wait_data);
1755
1756/**
1757 *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
1758 *	@sk: socket
1759 *	@size: memory size to allocate
1760 *	@kind: allocation type
1761 *
1762 *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
1763 *	rmem allocation. This function assumes that protocols which have
1764 *	memory_pressure use sk_wmem_queued as write buffer accounting.
1765 */
1766int __sk_mem_schedule(struct sock *sk, int size, int kind)
1767{
1768	struct proto *prot = sk->sk_prot;
1769	int amt = sk_mem_pages(size);
1770	long allocated;
1771	int parent_status = UNDER_LIMIT;
1772
1773	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
1774
1775	allocated = sk_memory_allocated_add(sk, amt, &parent_status);
1776
1777	/* Under limit. */
1778	if (parent_status == UNDER_LIMIT &&
1779			allocated <= sk_prot_mem_limits(sk, 0)) {
1780		sk_leave_memory_pressure(sk);
1781		return 1;
1782	}
1783
1784	/* Under pressure. (we or our parents) */
1785	if ((parent_status > SOFT_LIMIT) ||
1786			allocated > sk_prot_mem_limits(sk, 1))
1787		sk_enter_memory_pressure(sk);
1788
1789	/* Over hard limit (we or our parents) */
1790	if ((parent_status == OVER_LIMIT) ||
1791			(allocated > sk_prot_mem_limits(sk, 2)))
1792		goto suppress_allocation;
1793
1794	/* guarantee minimum buffer size under pressure */
1795	if (kind == SK_MEM_RECV) {
1796		if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
1797			return 1;
1798
1799	} else { /* SK_MEM_SEND */
1800		if (sk->sk_type == SOCK_STREAM) {
1801			if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
1802				return 1;
1803		} else if (atomic_read(&sk->sk_wmem_alloc) <
1804			   prot->sysctl_wmem[0])
1805				return 1;
1806	}
1807
1808	if (sk_has_memory_pressure(sk)) {
1809		int alloc;
1810
1811		if (!sk_under_memory_pressure(sk))
1812			return 1;
1813		alloc = sk_sockets_allocated_read_positive(sk);
1814		if (sk_prot_mem_limits(sk, 2) > alloc *
1815		    sk_mem_pages(sk->sk_wmem_queued +
1816				 atomic_read(&sk->sk_rmem_alloc) +
1817				 sk->sk_forward_alloc))
1818			return 1;
1819	}
1820
1821suppress_allocation:
1822
1823	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
1824		sk_stream_moderate_sndbuf(sk);
1825
1826		/* Fail only if socket is _under_ its sndbuf.
 1827		 * In this case we cannot block, so we have to fail.
1828		 */
1829		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
1830			return 1;
1831	}
1832
1833	trace_sock_exceed_buf_limit(sk, prot, allocated);
1834
1835	/* Alas. Undo changes. */
1836	sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
1837
1838	sk_memory_allocated_sub(sk, amt);
1839
1840	return 0;
1841}
1842EXPORT_SYMBOL(__sk_mem_schedule);
1843
1844/**
 1845 *	__sk_mem_reclaim - reclaim memory_allocated
1846 *	@sk: socket
1847 */
1848void __sk_mem_reclaim(struct sock *sk)
1849{
1850	sk_memory_allocated_sub(sk,
1851				sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT);
1852	sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
1853
1854	if (sk_under_memory_pressure(sk) &&
1855	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
1856		sk_leave_memory_pressure(sk);
1857}
1858EXPORT_SYMBOL(__sk_mem_reclaim);
1859
1860
1861/*
1862 * Set of default routines for initialising struct proto_ops when
1863 * the protocol does not support a particular function. In certain
1864 * cases where it makes no sense for a protocol to have a "do nothing"
1865 * function, some default processing is provided.
1866 */
1867
1868int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
1869{
1870	return -EOPNOTSUPP;
1871}
1872EXPORT_SYMBOL(sock_no_bind);
1873
1874int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
1875		    int len, int flags)
1876{
1877	return -EOPNOTSUPP;
1878}
1879EXPORT_SYMBOL(sock_no_connect);
1880
1881int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
1882{
1883	return -EOPNOTSUPP;
1884}
1885EXPORT_SYMBOL(sock_no_socketpair);
1886
1887int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
1888{
1889	return -EOPNOTSUPP;
1890}
1891EXPORT_SYMBOL(sock_no_accept);
1892
1893int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
1894		    int *len, int peer)
1895{
1896	return -EOPNOTSUPP;
1897}
1898EXPORT_SYMBOL(sock_no_getname);
1899
1900unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
1901{
1902	return 0;
1903}
1904EXPORT_SYMBOL(sock_no_poll);
1905
1906int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1907{
1908	return -EOPNOTSUPP;
1909}
1910EXPORT_SYMBOL(sock_no_ioctl);
1911
1912int sock_no_listen(struct socket *sock, int backlog)
1913{
1914	return -EOPNOTSUPP;
1915}
1916EXPORT_SYMBOL(sock_no_listen);
1917
1918int sock_no_shutdown(struct socket *sock, int how)
1919{
1920	return -EOPNOTSUPP;
1921}
1922EXPORT_SYMBOL(sock_no_shutdown);
1923
1924int sock_no_setsockopt(struct socket *sock, int level, int optname,
1925		    char __user *optval, unsigned int optlen)
1926{
1927	return -EOPNOTSUPP;
1928}
1929EXPORT_SYMBOL(sock_no_setsockopt);
1930
1931int sock_no_getsockopt(struct socket *sock, int level, int optname,
1932		    char __user *optval, int __user *optlen)
1933{
1934	return -EOPNOTSUPP;
1935}
1936EXPORT_SYMBOL(sock_no_getsockopt);
1937
1938int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1939		    size_t len)
1940{
1941	return -EOPNOTSUPP;
1942}
1943EXPORT_SYMBOL(sock_no_sendmsg);
1944
1945int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1946		    size_t len, int flags)
1947{
1948	return -EOPNOTSUPP;
1949}
1950EXPORT_SYMBOL(sock_no_recvmsg);
1951
1952int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
1953{
1954	/* Mirror missing mmap method error code */
1955	return -ENODEV;
1956}
1957EXPORT_SYMBOL(sock_no_mmap);
1958
1959ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
1960{
1961	ssize_t res;
1962	struct msghdr msg = {.msg_flags = flags};
1963	struct kvec iov;
1964	char *kaddr = kmap(page);
1965	iov.iov_base = kaddr + offset;
1966	iov.iov_len = size;
1967	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
1968	kunmap(page);
1969	return res;
1970}
1971EXPORT_SYMBOL(sock_no_sendpage);
1972
1973/*
1974 *	Default Socket Callbacks
1975 */
1976
1977static void sock_def_wakeup(struct sock *sk)
1978{
1979	struct socket_wq *wq;
1980
1981	rcu_read_lock();
1982	wq = rcu_dereference(sk->sk_wq);
1983	if (wq_has_sleeper(wq))
1984		wake_up_interruptible_all(&wq->wait);
1985	rcu_read_unlock();
1986}
1987
1988static void sock_def_error_report(struct sock *sk)
1989{
1990	struct socket_wq *wq;
1991
1992	rcu_read_lock();
1993	wq = rcu_dereference(sk->sk_wq);
1994	if (wq_has_sleeper(wq))
1995		wake_up_interruptible_poll(&wq->wait, POLLERR);
1996	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
1997	rcu_read_unlock();
1998}
1999
2000static void sock_def_readable(struct sock *sk, int len)
2001{
2002	struct socket_wq *wq;
2003
2004	rcu_read_lock();
2005	wq = rcu_dereference(sk->sk_wq);
2006	if (wq_has_sleeper(wq))
2007		wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
2008						POLLRDNORM | POLLRDBAND);
2009	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2010	rcu_read_unlock();
2011}
2012
2013static void sock_def_write_space(struct sock *sk)
2014{
2015	struct socket_wq *wq;
2016
2017	rcu_read_lock();
2018
2019	/* Do not wake up a writer until he can make "significant"
2020	 * progress.  --DaveM
2021	 */
2022	if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2023		wq = rcu_dereference(sk->sk_wq);
2024		if (wq_has_sleeper(wq))
2025			wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
2026						POLLWRNORM | POLLWRBAND);
2027
2028		/* Should agree with poll, otherwise some programs break */
2029		if (sock_writeable(sk))
2030			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2031	}
2032
2033	rcu_read_unlock();
2034}
2035
2036static void sock_def_destruct(struct sock *sk)
2037{
2038	kfree(sk->sk_protinfo);
2039}
2040
2041void sk_send_sigurg(struct sock *sk)
2042{
2043	if (sk->sk_socket && sk->sk_socket->file)
2044		if (send_sigurg(&sk->sk_socket->file->f_owner))
2045			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2046}
2047EXPORT_SYMBOL(sk_send_sigurg);
2048
2049void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2050		    unsigned long expires)
2051{
2052	if (!mod_timer(timer, expires))
2053		sock_hold(sk);
2054}
2055EXPORT_SYMBOL(sk_reset_timer);
2056
2057void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2058{
2059	if (timer_pending(timer) && del_timer(timer))
2060		__sock_put(sk);
2061}
2062EXPORT_SYMBOL(sk_stop_timer);
2063
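/*
 * Usage sketch (illustrative only): sk_reset_timer()/sk_stop_timer() keep the
 * socket refcount in step with a pending timer, so a protocol retransmit
 * timer typically looks like the hypothetical example below.  The handler
 * must drop the reference that sk_reset_timer() took when it armed the timer:
 *
 *	static void example_retransmit_timer(unsigned long data)
 *	{
 *		struct sock *sk = (struct sock *)data;
 *
 *		bh_lock_sock(sk);
 *		// ... retransmit, possibly re-arm with sk_reset_timer() ...
 *		bh_unlock_sock(sk);
 *		sock_put(sk);		// balance the sock_hold() from arming
 *	}
 *
 * Arming:   sk_reset_timer(sk, &sk->sk_timer, jiffies + HZ);
 * Stopping: sk_stop_timer(sk, &sk->sk_timer);   // drops the ref if pending
 */
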
2064void sock_init_data(struct socket *sock, struct sock *sk)
2065{
2066	skb_queue_head_init(&sk->sk_receive_queue);
2067	skb_queue_head_init(&sk->sk_write_queue);
2068	skb_queue_head_init(&sk->sk_error_queue);
2069#ifdef CONFIG_NET_DMA
2070	skb_queue_head_init(&sk->sk_async_wait_queue);
2071#endif
2072
2073	sk->sk_send_head	=	NULL;
2074
2075	init_timer(&sk->sk_timer);
2076
2077	sk->sk_allocation	=	GFP_KERNEL;
2078	sk->sk_rcvbuf		=	sysctl_rmem_default;
2079	sk->sk_sndbuf		=	sysctl_wmem_default;
2080	sk->sk_state		=	TCP_CLOSE;
2081	sk_set_socket(sk, sock);
2082
2083	sock_set_flag(sk, SOCK_ZAPPED);
2084
2085	if (sock) {
2086		sk->sk_type	=	sock->type;
2087		sk->sk_wq	=	sock->wq;
2088		sock->sk	=	sk;
2089	} else
2090		sk->sk_wq	=	NULL;
2091
2092	spin_lock_init(&sk->sk_dst_lock);
2093	rwlock_init(&sk->sk_callback_lock);
2094	lockdep_set_class_and_name(&sk->sk_callback_lock,
2095			af_callback_keys + sk->sk_family,
2096			af_family_clock_key_strings[sk->sk_family]);
2097
2098	sk->sk_state_change	=	sock_def_wakeup;
2099	sk->sk_data_ready	=	sock_def_readable;
2100	sk->sk_write_space	=	sock_def_write_space;
2101	sk->sk_error_report	=	sock_def_error_report;
2102	sk->sk_destruct		=	sock_def_destruct;
2103
2104	sk->sk_sndmsg_page	=	NULL;
2105	sk->sk_sndmsg_off	=	0;
2106	sk->sk_peek_off		=	-1;
2107
2108	sk->sk_peer_pid 	=	NULL;
2109	sk->sk_peer_cred	=	NULL;
2110	sk->sk_write_pending	=	0;
2111	sk->sk_rcvlowat		=	1;
2112	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
2113	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
2114
2115	sk->sk_stamp = ktime_set(-1L, 0);
2116
2117	/*
2118	 * Before updating sk_refcnt, we must commit prior changes to memory
2119	 * (Documentation/RCU/rculist_nulls.txt for details)
2120	 */
2121	smp_wmb();
2122	atomic_set(&sk->sk_refcnt, 1);
2123	atomic_set(&sk->sk_drops, 0);
2124}
2125EXPORT_SYMBOL(sock_init_data);
2126
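/*
 * Usage sketch (illustrative only): a protocol's create()/init() path normally
 * calls sock_init_data() right after sk_alloc() and then overrides whichever
 * default callbacks it needs.  "PF_EXAMPLE", "example_proto" and the
 * example_* functions are hypothetical:
 *
 *	static int example_create(struct net *net, struct socket *sock,
 *				  int protocol, int kern)
 *	{
 *		struct sock *sk;
 *
 *		sk = sk_alloc(net, PF_EXAMPLE, GFP_KERNEL, &example_proto);
 *		if (!sk)
 *			return -ENOMEM;
 *
 *		sock_init_data(sock, sk);		// queues, buffers, default callbacks
 *		sk->sk_data_ready = example_data_ready;	// override a default
 *		sk->sk_destruct   = example_destruct;
 *		return 0;
 *	}
 */
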
2127void lock_sock_nested(struct sock *sk, int subclass)
2128{
2129	might_sleep();
2130	spin_lock_bh(&sk->sk_lock.slock);
2131	if (sk->sk_lock.owned)
2132		__lock_sock(sk);
2133	sk->sk_lock.owned = 1;
2134	spin_unlock(&sk->sk_lock.slock);
2135	/*
2136	 * The sk_lock has mutex_lock() semantics here:
2137	 */
2138	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2139	local_bh_enable();
2140}
2141EXPORT_SYMBOL(lock_sock_nested);
2142
2143void release_sock(struct sock *sk)
2144{
2145	/*
2146	 * The sk_lock has mutex_unlock() semantics:
2147	 */
2148	mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
2149
2150	spin_lock_bh(&sk->sk_lock.slock);
2151	if (sk->sk_backlog.tail)
2152		__release_sock(sk);
2153	sk->sk_lock.owned = 0;
2154	if (waitqueue_active(&sk->sk_lock.wq))
2155		wake_up(&sk->sk_lock.wq);
2156	spin_unlock_bh(&sk->sk_lock.slock);
2157}
2158EXPORT_SYMBOL(release_sock);
2159
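/*
 * Usage sketch (illustrative only): process context takes the socket lock
 * around state changes so the backlog machinery above works, e.g. in a
 * hypothetical setsockopt handler:
 *
 *	lock_sock(sk);				// may sleep, see lock_sock_nested()
 *	sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
 *	sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF);
 *	release_sock(sk);			// runs the backlog, wakes waiters
 */
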
2160/**
2161 * lock_sock_fast - fast version of lock_sock
2162 * @sk: socket
2163 *
2164 * This version should be used for very small sections, where the process won't block.
2165 * Returns false if the fast path is taken:
2166 *   sk_lock.slock locked, owned = 0, BH disabled
2167 * Returns true if the slow path is taken:
2168 *   sk_lock.slock unlocked, owned = 1, BH enabled
2169 */
2170bool lock_sock_fast(struct sock *sk)
2171{
2172	might_sleep();
2173	spin_lock_bh(&sk->sk_lock.slock);
2174
2175	if (!sk->sk_lock.owned)
2176		/*
2177		 * Note: we return with BH disabled and sk_lock.slock held
2178		 */
2179		return false;
2180
2181	__lock_sock(sk);
2182	sk->sk_lock.owned = 1;
2183	spin_unlock(&sk->sk_lock.slock);
2184	/*
2185	 * The sk_lock has mutex_lock() semantics here:
2186	 */
2187	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2188	local_bh_enable();
2189	return true;
2190}
2191EXPORT_SYMBOL(lock_sock_fast);
2192
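/*
 * Usage sketch (illustrative only): the return value of lock_sock_fast() must
 * be fed back to unlock_sock_fast() so the right unlock path runs:
 *
 *	bool slow = lock_sock_fast(sk);
 *	// short, non-blocking critical section, e.g. counter updates
 *	unlock_sock_fast(sk, slow);
 */
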
2193int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2194{
2195	struct timeval tv;
2196	if (!sock_flag(sk, SOCK_TIMESTAMP))
2197		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2198	tv = ktime_to_timeval(sk->sk_stamp);
2199	if (tv.tv_sec == -1)
2200		return -ENOENT;
2201	if (tv.tv_sec == 0) {
2202		sk->sk_stamp = ktime_get_real();
2203		tv = ktime_to_timeval(sk->sk_stamp);
2204	}
2205	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2206}
2207EXPORT_SYMBOL(sock_get_timestamp);
2208
2209int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2210{
2211	struct timespec ts;
2212	if (!sock_flag(sk, SOCK_TIMESTAMP))
2213		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2214	ts = ktime_to_timespec(sk->sk_stamp);
2215	if (ts.tv_sec == -1)
2216		return -ENOENT;
2217	if (ts.tv_sec == 0) {
2218		sk->sk_stamp = ktime_get_real();
2219		ts = ktime_to_timespec(sk->sk_stamp);
2220	}
2221	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2222}
2223EXPORT_SYMBOL(sock_get_timestampns);
2224
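/*
 * Usage sketch (illustrative only): these two helpers back the
 * SIOCGSTAMP/SIOCGSTAMPNS ioctls, so a protocol ioctl handler would typically
 * contain something like:
 *
 *	case SIOCGSTAMP:
 *		return sock_get_timestamp(sk, (struct timeval __user *)arg);
 *	case SIOCGSTAMPNS:
 *		return sock_get_timestampns(sk, (struct timespec __user *)arg);
 */
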
2225void sock_enable_timestamp(struct sock *sk, int flag)
2226{
2227	if (!sock_flag(sk, flag)) {
2228		unsigned long previous_flags = sk->sk_flags;
2229
2230		sock_set_flag(sk, flag);
2231		/*
2232		 * We just set one of the two flags which require net
2233		 * time stamping, but time stamping might already have
2234		 * been on because of the other one.
2235		 */
2236		if (!(previous_flags & SK_FLAGS_TIMESTAMP))
2237			net_enable_timestamp();
2238	}
2239}
2240
2241/*
2242 *	Get a socket option on a socket.
2243 *
2244 *	FIX: POSIX 1003.1g is very ambiguous here. It states that
2245 *	asynchronous errors should be reported by getsockopt. We assume
2246 *	this means if you specify SO_ERROR (otherwise what's the point of it).
2247 */
2248int sock_common_getsockopt(struct socket *sock, int level, int optname,
2249			   char __user *optval, int __user *optlen)
2250{
2251	struct sock *sk = sock->sk;
2252
2253	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2254}
2255EXPORT_SYMBOL(sock_common_getsockopt);
2256
2257#ifdef CONFIG_COMPAT
2258int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2259				  char __user *optval, int __user *optlen)
2260{
2261	struct sock *sk = sock->sk;
2262
2263	if (sk->sk_prot->compat_getsockopt != NULL)
2264		return sk->sk_prot->compat_getsockopt(sk, level, optname,
2265						      optval, optlen);
2266	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2267}
2268EXPORT_SYMBOL(compat_sock_common_getsockopt);
2269#endif
2270
2271int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
2272			struct msghdr *msg, size_t size, int flags)
2273{
2274	struct sock *sk = sock->sk;
2275	int addr_len = 0;
2276	int err;
2277
2278	err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
2279				   flags & ~MSG_DONTWAIT, &addr_len);
2280	if (err >= 0)
2281		msg->msg_namelen = addr_len;
2282	return err;
2283}
2284EXPORT_SYMBOL(sock_common_recvmsg);
2285
2286/*
2287 *	Set socket options on an inet socket.
2288 */
2289int sock_common_setsockopt(struct socket *sock, int level, int optname,
2290			   char __user *optval, unsigned int optlen)
2291{
2292	struct sock *sk = sock->sk;
2293
2294	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2295}
2296EXPORT_SYMBOL(sock_common_setsockopt);
2297
2298#ifdef CONFIG_COMPAT
2299int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2300				  char __user *optval, unsigned int optlen)
2301{
2302	struct sock *sk = sock->sk;
2303
2304	if (sk->sk_prot->compat_setsockopt != NULL)
2305		return sk->sk_prot->compat_setsockopt(sk, level, optname,
2306						      optval, optlen);
2307	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2308}
2309EXPORT_SYMBOL(compat_sock_common_setsockopt);
2310#endif
2311
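/*
 * Usage sketch (illustrative only): families that keep their real logic in
 * struct proto simply route the socket-level calls through the sock_common_*
 * helpers above, roughly ("PF_EXAMPLE" and "example_stream_ops" are
 * hypothetical):
 *
 *	static const struct proto_ops example_stream_ops = {
 *		.family		= PF_EXAMPLE,
 *		.owner		= THIS_MODULE,
 *		.setsockopt	= sock_common_setsockopt,
 *		.getsockopt	= sock_common_getsockopt,
 *		.recvmsg	= sock_common_recvmsg,
 *		// remaining ops filled in by the protocol or the sock_no_* stubs
 *	};
 */
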
2312void sk_common_release(struct sock *sk)
2313{
2314	if (sk->sk_prot->destroy)
2315		sk->sk_prot->destroy(sk);
2316
2317	/*
2318	 * Observation: when sk_common_release is called, processes have
2319	 * no access to the socket, but the network stack still does.
2320	 * Step one, detach it from networking:
2321	 *
2322	 * A. Remove from hash tables.
2323	 */
2324
2325	sk->sk_prot->unhash(sk);
2326
2327	/*
2328	 * At this point the socket cannot receive new packets, but some may
2329	 * still be in flight because some CPU ran the receiver and did the
2330	 * hash table lookup before we unhashed the socket. They will reach
2331	 * the receive queue and be purged by the socket destructor.
2332	 *
2333	 * Also we still have packets pending on the receive queue and,
2334	 * probably, our own packets waiting in device queues. sock_destroy
2335	 * will drain the receive queue, but transmitted packets will delay
2336	 * socket destruction until the last reference is released.
2337	 */
2338
2339	sock_orphan(sk);
2340
2341	xfrm_sk_free_policy(sk);
2342
2343	sk_refcnt_debug_release(sk);
2344	sock_put(sk);
2345}
2346EXPORT_SYMBOL(sk_common_release);
2347
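/*
 * Usage sketch (illustrative only): simple protocols point their struct proto
 * ->close at a thin wrapper around sk_common_release(), e.g. (hypothetical):
 *
 *	static void example_close(struct sock *sk, long timeout)
 *	{
 *		sk_common_release(sk);
 *	}
 */
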
2348#ifdef CONFIG_PROC_FS
2349#define PROTO_INUSE_NR	64	/* should be enough for now */
2350struct prot_inuse {
2351	int val[PROTO_INUSE_NR];
2352};
2353
2354static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2355
2356#ifdef CONFIG_NET_NS
2357void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2358{
2359	__this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
2360}
2361EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2362
2363int sock_prot_inuse_get(struct net *net, struct proto *prot)
2364{
2365	int cpu, idx = prot->inuse_idx;
2366	int res = 0;
2367
2368	for_each_possible_cpu(cpu)
2369		res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2370
2371	return res >= 0 ? res : 0;
2372}
2373EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2374
2375static int __net_init sock_inuse_init_net(struct net *net)
2376{
2377	net->core.inuse = alloc_percpu(struct prot_inuse);
2378	return net->core.inuse ? 0 : -ENOMEM;
2379}
2380
2381static void __net_exit sock_inuse_exit_net(struct net *net)
2382{
2383	free_percpu(net->core.inuse);
2384}
2385
2386static struct pernet_operations net_inuse_ops = {
2387	.init = sock_inuse_init_net,
2388	.exit = sock_inuse_exit_net,
2389};
2390
2391static __init int net_inuse_init(void)
2392{
2393	if (register_pernet_subsys(&net_inuse_ops))
2394		panic("Cannot initialize net inuse counters");
2395
2396	return 0;
2397}
2398
2399core_initcall(net_inuse_init);
2400#else
2401static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2402
2403void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2404{
2405	__this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
2406}
2407EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2408
2409int sock_prot_inuse_get(struct net *net, struct proto *prot)
2410{
2411	int cpu, idx = prot->inuse_idx;
2412	int res = 0;
2413
2414	for_each_possible_cpu(cpu)
2415		res += per_cpu(prot_inuse, cpu).val[idx];
2416
2417	return res >= 0 ? res : 0;
2418}
2419EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2420#endif
2421
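/*
 * Usage sketch (illustrative only): protocols bump these per-cpu counters from
 * their hash/unhash callbacks so /proc/net/protocols can report the number of
 * sockets in use, e.g. (example_* names are hypothetical):
 *
 *	static void example_hash(struct sock *sk)
 *	{
 *		// ... insert sk into the protocol's lookup table ...
 *		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 *	}
 *
 *	static void example_unhash(struct sock *sk)
 *	{
 *		// ... remove sk from the lookup table ...
 *		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
 *	}
 */
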
2422static void assign_proto_idx(struct proto *prot)
2423{
2424	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2425
2426	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2427		printk(KERN_ERR "PROTO_INUSE_NR exhausted\n");
2428		return;
2429	}
2430
2431	set_bit(prot->inuse_idx, proto_inuse_idx);
2432}
2433
2434static void release_proto_idx(struct proto *prot)
2435{
2436	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2437		clear_bit(prot->inuse_idx, proto_inuse_idx);
2438}
2439#else
2440static inline void assign_proto_idx(struct proto *prot)
2441{
2442}
2443
2444static inline void release_proto_idx(struct proto *prot)
2445{
2446}
2447#endif
2448
2449int proto_register(struct proto *prot, int alloc_slab)
2450{
2451	if (alloc_slab) {
2452		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2453					SLAB_HWCACHE_ALIGN | prot->slab_flags,
2454					NULL);
2455
2456		if (prot->slab == NULL) {
2457			printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
2458			       prot->name);
2459			goto out;
2460		}
2461
2462		if (prot->rsk_prot != NULL) {
2463			prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name);
2464			if (prot->rsk_prot->slab_name == NULL)
2465				goto out_free_sock_slab;
2466
2467			prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
2468								 prot->rsk_prot->obj_size, 0,
2469								 SLAB_HWCACHE_ALIGN, NULL);
2470
2471			if (prot->rsk_prot->slab == NULL) {
2472				printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
2473				       prot->name);
2474				goto out_free_request_sock_slab_name;
2475			}
2476		}
2477
2478		if (prot->twsk_prot != NULL) {
2479			prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
2480
2481			if (prot->twsk_prot->twsk_slab_name == NULL)
2482				goto out_free_request_sock_slab;
2483
2484			prot->twsk_prot->twsk_slab =
2485				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2486						  prot->twsk_prot->twsk_obj_size,
2487						  0,
2488						  SLAB_HWCACHE_ALIGN |
2489							prot->slab_flags,
2490						  NULL);
2491			if (prot->twsk_prot->twsk_slab == NULL)
2492				goto out_free_timewait_sock_slab_name;
2493		}
2494	}
2495
2496	mutex_lock(&proto_list_mutex);
2497	list_add(&prot->node, &proto_list);
2498	assign_proto_idx(prot);
2499	mutex_unlock(&proto_list_mutex);
2500	return 0;
2501
2502out_free_timewait_sock_slab_name:
2503	kfree(prot->twsk_prot->twsk_slab_name);
2504out_free_request_sock_slab:
2505	if (prot->rsk_prot && prot->rsk_prot->slab) {
2506		kmem_cache_destroy(prot->rsk_prot->slab);
2507		prot->rsk_prot->slab = NULL;
2508	}
2509out_free_request_sock_slab_name:
2510	if (prot->rsk_prot)
2511		kfree(prot->rsk_prot->slab_name);
2512out_free_sock_slab:
2513	kmem_cache_destroy(prot->slab);
2514	prot->slab = NULL;
2515out:
2516	return -ENOBUFS;
2517}
2518EXPORT_SYMBOL(proto_register);
2519
2520void proto_unregister(struct proto *prot)
2521{
2522	mutex_lock(&proto_list_mutex);
2523	release_proto_idx(prot);
2524	list_del(&prot->node);
2525	mutex_unlock(&proto_list_mutex);
2526
2527	if (prot->slab != NULL) {
2528		kmem_cache_destroy(prot->slab);
2529		prot->slab = NULL;
2530	}
2531
2532	if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
2533		kmem_cache_destroy(prot->rsk_prot->slab);
2534		kfree(prot->rsk_prot->slab_name);
2535		prot->rsk_prot->slab = NULL;
2536	}
2537
2538	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2539		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2540		kfree(prot->twsk_prot->twsk_slab_name);
2541		prot->twsk_prot->twsk_slab = NULL;
2542	}
2543}
2544EXPORT_SYMBOL(proto_unregister);
2545
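/*
 * Usage sketch (illustrative only): a protocol module registers its struct
 * proto at init time and unregisters it on exit.  The names below are
 * hypothetical:
 *
 *	static struct proto example_proto = {
 *		.name		= "EXAMPLE",
 *		.owner		= THIS_MODULE,
 *		.obj_size	= sizeof(struct example_sock),
 *	};
 *
 *	static int __init example_init(void)
 *	{
 *		return proto_register(&example_proto, 1);	// 1 = allocate a slab
 *	}
 *
 *	static void __exit example_exit(void)
 *	{
 *		proto_unregister(&example_proto);
 *	}
 */
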
2546#ifdef CONFIG_PROC_FS
2547static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2548	__acquires(proto_list_mutex)
2549{
2550	mutex_lock(&proto_list_mutex);
2551	return seq_list_start_head(&proto_list, *pos);
2552}
2553
2554static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2555{
2556	return seq_list_next(v, &proto_list, pos);
2557}
2558
2559static void proto_seq_stop(struct seq_file *seq, void *v)
2560	__releases(proto_list_mutex)
2561{
2562	mutex_unlock(&proto_list_mutex);
2563}
2564
2565static char proto_method_implemented(const void *method)
2566{
2567	return method == NULL ? 'n' : 'y';
2568}
2569static long sock_prot_memory_allocated(struct proto *proto)
2570{
2571	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
2572}
2573
2574static char *sock_prot_memory_pressure(struct proto *proto)
2575{
2576	return proto->memory_pressure != NULL ?
2577	return proto->memory_pressure != NULL ?
2578	       proto_memory_pressure(proto) ? "yes" : "no" : "NI";
2579
2580static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2581{
2582
2583	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
2584			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2585		   proto->name,
2586		   proto->obj_size,
2587		   sock_prot_inuse_get(seq_file_net(seq), proto),
2588		   sock_prot_memory_allocated(proto),
2589		   sock_prot_memory_pressure(proto),
2590		   proto->max_header,
2591		   proto->slab == NULL ? "no" : "yes",
2592		   module_name(proto->owner),
2593		   proto_method_implemented(proto->close),
2594		   proto_method_implemented(proto->connect),
2595		   proto_method_implemented(proto->disconnect),
2596		   proto_method_implemented(proto->accept),
2597		   proto_method_implemented(proto->ioctl),
2598		   proto_method_implemented(proto->init),
2599		   proto_method_implemented(proto->destroy),
2600		   proto_method_implemented(proto->shutdown),
2601		   proto_method_implemented(proto->setsockopt),
2602		   proto_method_implemented(proto->getsockopt),
2603		   proto_method_implemented(proto->sendmsg),
2604		   proto_method_implemented(proto->recvmsg),
2605		   proto_method_implemented(proto->sendpage),
2606		   proto_method_implemented(proto->bind),
2607		   proto_method_implemented(proto->backlog_rcv),
2608		   proto_method_implemented(proto->hash),
2609		   proto_method_implemented(proto->unhash),
2610		   proto_method_implemented(proto->get_port),
2611		   proto_method_implemented(proto->enter_memory_pressure));
2612}
2613
2614static int proto_seq_show(struct seq_file *seq, void *v)
2615{
2616	if (v == &proto_list)
2617		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2618			   "protocol",
2619			   "size",
2620			   "sockets",
2621			   "memory",
2622			   "press",
2623			   "maxhdr",
2624			   "slab",
2625			   "module",
2626			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2627	else
2628		proto_seq_printf(seq, list_entry(v, struct proto, node));
2629	return 0;
2630}
2631
2632static const struct seq_operations proto_seq_ops = {
2633	.start  = proto_seq_start,
2634	.next   = proto_seq_next,
2635	.stop   = proto_seq_stop,
2636	.show   = proto_seq_show,
2637};
2638
2639static int proto_seq_open(struct inode *inode, struct file *file)
2640{
2641	return seq_open_net(inode, file, &proto_seq_ops,
2642			    sizeof(struct seq_net_private));
2643}
2644
2645static const struct file_operations proto_seq_fops = {
2646	.owner		= THIS_MODULE,
2647	.open		= proto_seq_open,
2648	.read		= seq_read,
2649	.llseek		= seq_lseek,
2650	.release	= seq_release_net,
2651};
2652
2653static __net_init int proto_init_net(struct net *net)
2654{
2655	if (!proc_net_fops_create(net, "protocols", S_IRUGO, &proto_seq_fops))
2656		return -ENOMEM;
2657
2658	return 0;
2659}
2660
2661static __net_exit void proto_exit_net(struct net *net)
2662{
2663	proc_net_remove(net, "protocols");
2664}
2665
2666
2667static __net_initdata struct pernet_operations proto_net_ops = {
2668	.init = proto_init_net,
2669	.exit = proto_exit_net,
2670};
2671
2672static int __init proto_init(void)
2673{
2674	return register_pernet_subsys(&proto_net_ops);
2675}
2676
2677subsys_initcall(proto_init);
2678
2679#endif /* PROC_FS */
2680