af_unix.c revision 12663bfc97c8b3fdb292428105dd92d563164050
1/*
2 * NET4:	Implementation of BSD Unix domain sockets.
3 *
4 * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
5 *
6 *		This program is free software; you can redistribute it and/or
7 *		modify it under the terms of the GNU General Public License
8 *		as published by the Free Software Foundation; either version
9 *		2 of the License, or (at your option) any later version.
10 *
11 * Fixes:
12 *		Linus Torvalds	:	Assorted bug cures.
13 *		Niibe Yutaka	:	async I/O support.
14 *		Carsten Paeth	:	PF_UNIX check, address fixes.
15 *		Alan Cox	:	Limit size of allocated blocks.
16 *		Alan Cox	:	Fixed the stupid socketpair bug.
17 *		Alan Cox	:	BSD compatibility fine tuning.
18 *		Alan Cox	:	Fixed a bug in connect when interrupted.
19 *		Alan Cox	:	Sorted out a proper draft version of
20 *					file descriptor passing hacked up from
21 *					Mike Shaver's work.
22 *		Marty Leisner	:	Fixes to fd passing
23 *		Nick Nevin	:	recvmsg bugfix.
24 *		Alan Cox	:	Started proper garbage collector
25 *		Heiko EiBfeldt	:	Missing verify_area check
26 *		Alan Cox	:	Started POSIXisms
27 *		Andreas Schwab	:	Replace inode by dentry for proper
28 *					reference counting
29 *		Kirk Petersen	:	Made this a module
30 *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
31 *					Lots of bug fixes.
32 *	     Alexey Kuznetsov	:	Repaired (I hope) bugs introduced
33 *					by the above two patches.
34 *	     Andrea Arcangeli	:	If possible we block in connect(2)
35 *					if the max backlog of the listen socket
36 *					has been reached. This won't break
37 *					old apps and it will avoid a huge amount
38 *					of socks being hashed (this is for unix_gc()
39 *					performance reasons).
40 *					Security fix that limits the max
41 *					number of socks to 2*max_files and
42 *					the number of skb queueable in the
43 *					dgram receiver.
44 *		Artur Skawina   :	Hash function optimizations
45 *	     Alexey Kuznetsov   :	Full scale SMP. Lot of bugs are introduced 8)
46 *	      Malcolm Beattie   :	Set peercred for socketpair
47 *	     Michal Ostrowski   :       Module initialization cleanup.
48 *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
49 *	     				the core infrastructure is doing that
50 *	     				for all net proto families now (2.5.69+)
51 *
52 *
53 * Known differences from reference BSD that was tested:
54 *
55 *	[TO FIX]
56 *	ECONNREFUSED is not returned from one end of a connected() socket to the
57 *		other the moment one end closes.
58 *	fstat() doesn't return st_dev=0, and gives the blksize as the high water
59 *		mark and a fake inode identifier (nor the BSD first-socket-fstat-twice bug).
60 *	[NOT TO FIX]
61 *	accept() returns a path name even if the connecting socket has closed
62 *		in the meantime (BSD loses the path and gives up).
63 *	accept() returns 0 length path for an unbound connector. BSD returns 16
64 *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
65 *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
66 *	BSD af_unix apparently has a connect() that forgets to block properly.
67 *		(need to check this with the POSIX spec in detail)
68 *
69 * Differences from 2.0.0-11-... (ANK)
70 *	Bug fixes and improvements.
71 *		- client shutdown killed server socket.
72 *		- removed all useless cli/sti pairs.
73 *
74 *	Semantic changes/extensions.
75 *		- generic control message passing.
76 *		- SCM_CREDENTIALS control message.
77 *		- "Abstract" (not FS based) socket bindings.
78 *		  Abstract names are sequences of bytes (not zero terminated)
79 *		  started by 0, so that this name space does not intersect
80 *		  with BSD names.
81 */
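/*
 * Example: the two address flavours described above, as a minimal userspace
 * sketch (a hedged illustration; the path and abstract name are made up,
 * error handling is omitted):
 *
 *	#include <stddef.h>
 *	#include <string.h>
 *	#include <sys/socket.h>
 *	#include <sys/un.h>
 *
 *	static int bind_unix(int fd, int abstract)
 *	{
 *		struct sockaddr_un sun;
 *
 *		memset(&sun, 0, sizeof(sun));
 *		sun.sun_family = AF_UNIX;
 *		if (abstract) {
 *			// sun_path[0] == 0 selects the abstract namespace;
 *			// the name is the byte sequence that follows, not
 *			// NUL terminated, so the passed length must be exact
 *			memcpy(sun.sun_path, "\0myname", 7);
 *			return bind(fd, (struct sockaddr *)&sun,
 *				    offsetof(struct sockaddr_un, sun_path) + 7);
 *		}
 *		// filesystem binding: a NUL-terminated path in sun_path
 *		strcpy(sun.sun_path, "/tmp/mysock");
 *		return bind(fd, (struct sockaddr *)&sun, sizeof(sun));
 *	}
 */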
82
83#include <linux/module.h>
84#include <linux/kernel.h>
85#include <linux/signal.h>
86#include <linux/sched.h>
87#include <linux/errno.h>
88#include <linux/string.h>
89#include <linux/stat.h>
90#include <linux/dcache.h>
91#include <linux/namei.h>
92#include <linux/socket.h>
93#include <linux/un.h>
94#include <linux/fcntl.h>
95#include <linux/termios.h>
96#include <linux/sockios.h>
97#include <linux/net.h>
98#include <linux/in.h>
99#include <linux/fs.h>
100#include <linux/slab.h>
101#include <asm/uaccess.h>
102#include <linux/skbuff.h>
103#include <linux/netdevice.h>
104#include <net/net_namespace.h>
105#include <net/sock.h>
106#include <net/tcp_states.h>
107#include <net/af_unix.h>
108#include <linux/proc_fs.h>
109#include <linux/seq_file.h>
110#include <net/scm.h>
111#include <linux/init.h>
112#include <linux/poll.h>
113#include <linux/rtnetlink.h>
114#include <linux/mount.h>
115#include <net/checksum.h>
116#include <linux/security.h>
117#include <linux/freezer.h>
118
119struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE];
120EXPORT_SYMBOL_GPL(unix_socket_table);
121DEFINE_SPINLOCK(unix_table_lock);
122EXPORT_SYMBOL_GPL(unix_table_lock);
123static atomic_long_t unix_nr_socks;
124
125
126static struct hlist_head *unix_sockets_unbound(void *addr)
127{
128	unsigned long hash = (unsigned long)addr;
129
130	hash ^= hash >> 16;
131	hash ^= hash >> 8;
132	hash %= UNIX_HASH_SIZE;
133	return &unix_socket_table[UNIX_HASH_SIZE + hash];
134}
135
136#define UNIX_ABSTRACT(sk)	(unix_sk(sk)->addr->hash < UNIX_HASH_SIZE)
137
138#ifdef CONFIG_SECURITY_NETWORK
139static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
140{
141	memcpy(UNIXSID(skb), &scm->secid, sizeof(u32));
142}
143
144static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
145{
146	scm->secid = *UNIXSID(skb);
147}
148#else
149static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
150{ }
151
152static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
153{ }
154#endif /* CONFIG_SECURITY_NETWORK */
155
156/*
157 *  SMP locking strategy:
158 *    hash table is protected with spinlock unix_table_lock
159 *    each socket state is protected by a separate spin lock.
160 */
161
162static inline unsigned int unix_hash_fold(__wsum n)
163{
164	unsigned int hash = (__force unsigned int)n;
165
166	hash ^= hash>>16;
167	hash ^= hash>>8;
168	return hash&(UNIX_HASH_SIZE-1);
169}
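/*
 * Worked example of the fold above (assuming UNIX_HASH_SIZE == 256, its
 * value in af_unix.h at this revision): a checksum of 0x12345678 folds as
 *
 *	0x12345678 ^ (0x12345678 >> 16) = 0x1234444c
 *	0x1234444c ^ (0x1234444c >> 8)  = 0x12267008
 *	0x12267008 & (256 - 1)          = 0x08
 *
 * so every bit of the checksum influences the final bucket index.
 */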
170
171#define unix_peer(sk) (unix_sk(sk)->peer)
172
173static inline int unix_our_peer(struct sock *sk, struct sock *osk)
174{
175	return unix_peer(osk) == sk;
176}
177
178static inline int unix_may_send(struct sock *sk, struct sock *osk)
179{
180	return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
181}
182
183static inline int unix_recvq_full(struct sock const *sk)
184{
185	return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
186}
187
188struct sock *unix_peer_get(struct sock *s)
189{
190	struct sock *peer;
191
192	unix_state_lock(s);
193	peer = unix_peer(s);
194	if (peer)
195		sock_hold(peer);
196	unix_state_unlock(s);
197	return peer;
198}
199EXPORT_SYMBOL_GPL(unix_peer_get);
200
201static inline void unix_release_addr(struct unix_address *addr)
202{
203	if (atomic_dec_and_test(&addr->refcnt))
204		kfree(addr);
205}
206
207/*
208 *	Check unix socket name:
209 *		- it should not be zero length.
210 *	        - if it does not start with a zero byte, it should be NUL terminated (FS object)
211 *		- if it starts with a zero byte, it is an abstract name.
212 */
213
214static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned int *hashp)
215{
216	if (len <= sizeof(short) || len > sizeof(*sunaddr))
217		return -EINVAL;
218	if (!sunaddr || sunaddr->sun_family != AF_UNIX)
219		return -EINVAL;
220	if (sunaddr->sun_path[0]) {
221		/*
222		 * This may look like an off by one error but it is a bit more
223		 * subtle. 108 is the longest valid AF_UNIX path for a binding.
224		 * sun_path[108] doesn't exist as such.  However, in kernel space
225		 * we are guaranteed that it is a valid memory location in our
226		 * kernel address buffer.
227		 */
228		((char *)sunaddr)[len] = 0;
229		len = strlen(sunaddr->sun_path)+1+sizeof(short);
230		return len;
231	}
232
233	*hashp = unix_hash_fold(csum_partial(sunaddr, len, 0));
234	return len;
235}
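/*
 * Example of the two cases (a sketch with made-up names, seen from the
 * userspace side of the address):
 *
 *	struct sockaddr_un sun = { .sun_family = AF_UNIX };
 *
 *	strcpy(sun.sun_path, "/tmp/x");
 *	// FS name: unix_mkname() NUL-terminates defensively and returns
 *	// strlen("/tmp/x") + 1 + sizeof(short)
 *
 *	memcpy(sun.sun_path, "\0x", 2);
 *	// abstract name: unix_mkname() keeps the caller's len and
 *	// fills *hashp from a checksum of the bytes
 */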
236
237static void __unix_remove_socket(struct sock *sk)
238{
239	sk_del_node_init(sk);
240}
241
242static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
243{
244	WARN_ON(!sk_unhashed(sk));
245	sk_add_node(sk, list);
246}
247
248static inline void unix_remove_socket(struct sock *sk)
249{
250	spin_lock(&unix_table_lock);
251	__unix_remove_socket(sk);
252	spin_unlock(&unix_table_lock);
253}
254
255static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
256{
257	spin_lock(&unix_table_lock);
258	__unix_insert_socket(list, sk);
259	spin_unlock(&unix_table_lock);
260}
261
262static struct sock *__unix_find_socket_byname(struct net *net,
263					      struct sockaddr_un *sunname,
264					      int len, int type, unsigned int hash)
265{
266	struct sock *s;
267
268	sk_for_each(s, &unix_socket_table[hash ^ type]) {
269		struct unix_sock *u = unix_sk(s);
270
271		if (!net_eq(sock_net(s), net))
272			continue;
273
274		if (u->addr->len == len &&
275		    !memcmp(u->addr->name, sunname, len))
276			goto found;
277	}
278	s = NULL;
279found:
280	return s;
281}
282
283static inline struct sock *unix_find_socket_byname(struct net *net,
284						   struct sockaddr_un *sunname,
285						   int len, int type,
286						   unsigned int hash)
287{
288	struct sock *s;
289
290	spin_lock(&unix_table_lock);
291	s = __unix_find_socket_byname(net, sunname, len, type, hash);
292	if (s)
293		sock_hold(s);
294	spin_unlock(&unix_table_lock);
295	return s;
296}
297
298static struct sock *unix_find_socket_byinode(struct inode *i)
299{
300	struct sock *s;
301
302	spin_lock(&unix_table_lock);
303	sk_for_each(s,
304		    &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
305		struct dentry *dentry = unix_sk(s)->path.dentry;
306
307		if (dentry && dentry->d_inode == i) {
308			sock_hold(s);
309			goto found;
310		}
311	}
312	s = NULL;
313found:
314	spin_unlock(&unix_table_lock);
315	return s;
316}
317
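/* A socket counts as writable while its pending write memory is at most a
 * quarter of the send buffer, i.e. wmem_alloc * 4 <= sndbuf. */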
318static inline int unix_writable(struct sock *sk)
319{
320	return (atomic_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
321}
322
323static void unix_write_space(struct sock *sk)
324{
325	struct socket_wq *wq;
326
327	rcu_read_lock();
328	if (unix_writable(sk)) {
329		wq = rcu_dereference(sk->sk_wq);
330		if (wq_has_sleeper(wq))
331			wake_up_interruptible_sync_poll(&wq->wait,
332				POLLOUT | POLLWRNORM | POLLWRBAND);
333		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
334	}
335	rcu_read_unlock();
336}
337
338/* When a dgram socket disconnects (or changes its peer), we clear its
339 * receive queue of packets that arrived from the previous peer. First,
340 * this allows flow control based only on wmem_alloc; second, an sk
341 * connected to a peer may receive messages only from that peer. */
342static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
343{
344	if (!skb_queue_empty(&sk->sk_receive_queue)) {
345		skb_queue_purge(&sk->sk_receive_queue);
346		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
347
348		/* If one link of a bidirectional dgram pipe is disconnected,
349		 * we signal an error. Messages are lost. Do not do this
350		 * when the peer was not connected to us.
351		 */
352		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
353			other->sk_err = ECONNRESET;
354			other->sk_error_report(other);
355		}
356	}
357}
358
359static void unix_sock_destructor(struct sock *sk)
360{
361	struct unix_sock *u = unix_sk(sk);
362
363	skb_queue_purge(&sk->sk_receive_queue);
364
365	WARN_ON(atomic_read(&sk->sk_wmem_alloc));
366	WARN_ON(!sk_unhashed(sk));
367	WARN_ON(sk->sk_socket);
368	if (!sock_flag(sk, SOCK_DEAD)) {
369		printk(KERN_INFO "Attempt to release alive unix socket: %p\n", sk);
370		return;
371	}
372
373	if (u->addr)
374		unix_release_addr(u->addr);
375
376	atomic_long_dec(&unix_nr_socks);
377	local_bh_disable();
378	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
379	local_bh_enable();
380#ifdef UNIX_REFCNT_DEBUG
381	printk(KERN_DEBUG "UNIX %p is destroyed, %ld are still alive.\n", sk,
382		atomic_long_read(&unix_nr_socks));
383#endif
384}
385
386static void unix_release_sock(struct sock *sk, int embrion)
387{
388	struct unix_sock *u = unix_sk(sk);
389	struct path path;
390	struct sock *skpair;
391	struct sk_buff *skb;
392	int state;
393
394	unix_remove_socket(sk);
395
396	/* Clear state */
397	unix_state_lock(sk);
398	sock_orphan(sk);
399	sk->sk_shutdown = SHUTDOWN_MASK;
400	path	     = u->path;
401	u->path.dentry = NULL;
402	u->path.mnt = NULL;
403	state = sk->sk_state;
404	sk->sk_state = TCP_CLOSE;
405	unix_state_unlock(sk);
406
407	wake_up_interruptible_all(&u->peer_wait);
408
409	skpair = unix_peer(sk);
410
411	if (skpair != NULL) {
412		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
413			unix_state_lock(skpair);
414			/* No more writes */
415			skpair->sk_shutdown = SHUTDOWN_MASK;
416			if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
417				skpair->sk_err = ECONNRESET;
418			unix_state_unlock(skpair);
419			skpair->sk_state_change(skpair);
420			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
421		}
422		sock_put(skpair); /* It may now die */
423		unix_peer(sk) = NULL;
424	}
425
426	/* Try to flush out this socket. Throw out buffers at least */
427
428	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
429		if (state == TCP_LISTEN)
430			unix_release_sock(skb->sk, 1);
431		/* passed fds are erased in the kfree_skb hook	      */
432		kfree_skb(skb);
433	}
434
435	if (path.dentry)
436		path_put(&path);
437
438	sock_put(sk);
439
440	/* ---- Socket is dead now and most probably destroyed ---- */
441
442	/*
443	 * Fixme: BSD difference: In BSD all sockets connected to us get
444	 *	  ECONNRESET and we die on the spot. In Linux we behave
445	 *	  like files and pipes do and wait for the last
446	 *	  dereference.
447	 *
448	 * Can't we simply set sock->err?
449	 *
450	 *	  What is the above comment talking about? --ANK(980817)
451	 */
452
453	if (unix_tot_inflight)
454		unix_gc();		/* Garbage collect fds */
455}
456
457static void init_peercred(struct sock *sk)
458{
459	put_pid(sk->sk_peer_pid);
460	if (sk->sk_peer_cred)
461		put_cred(sk->sk_peer_cred);
462	sk->sk_peer_pid  = get_pid(task_tgid(current));
463	sk->sk_peer_cred = get_current_cred();
464}
465
466static void copy_peercred(struct sock *sk, struct sock *peersk)
467{
468	put_pid(sk->sk_peer_pid);
469	if (sk->sk_peer_cred)
470		put_cred(sk->sk_peer_cred);
471	sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
472	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
473}
474
475static int unix_listen(struct socket *sock, int backlog)
476{
477	int err;
478	struct sock *sk = sock->sk;
479	struct unix_sock *u = unix_sk(sk);
480	struct pid *old_pid = NULL;
481
482	err = -EOPNOTSUPP;
483	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
484		goto out;	/* Only stream/seqpacket sockets accept */
485	err = -EINVAL;
486	if (!u->addr)
487		goto out;	/* No listens on an unbound socket */
488	unix_state_lock(sk);
489	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
490		goto out_unlock;
491	if (backlog > sk->sk_max_ack_backlog)
492		wake_up_interruptible_all(&u->peer_wait);
493	sk->sk_max_ack_backlog	= backlog;
494	sk->sk_state		= TCP_LISTEN;
495	/* set credentials so connect can copy them */
496	init_peercred(sk);
497	err = 0;
498
499out_unlock:
500	unix_state_unlock(sk);
501	put_pid(old_pid);
502out:
503	return err;
504}
505
506static int unix_release(struct socket *);
507static int unix_bind(struct socket *, struct sockaddr *, int);
508static int unix_stream_connect(struct socket *, struct sockaddr *,
509			       int addr_len, int flags);
510static int unix_socketpair(struct socket *, struct socket *);
511static int unix_accept(struct socket *, struct socket *, int);
512static int unix_getname(struct socket *, struct sockaddr *, int *, int);
513static unsigned int unix_poll(struct file *, struct socket *, poll_table *);
514static unsigned int unix_dgram_poll(struct file *, struct socket *,
515				    poll_table *);
516static int unix_ioctl(struct socket *, unsigned int, unsigned long);
517static int unix_shutdown(struct socket *, int);
518static int unix_stream_sendmsg(struct kiocb *, struct socket *,
519			       struct msghdr *, size_t);
520static int unix_stream_recvmsg(struct kiocb *, struct socket *,
521			       struct msghdr *, size_t, int);
522static int unix_dgram_sendmsg(struct kiocb *, struct socket *,
523			      struct msghdr *, size_t);
524static int unix_dgram_recvmsg(struct kiocb *, struct socket *,
525			      struct msghdr *, size_t, int);
526static int unix_dgram_connect(struct socket *, struct sockaddr *,
527			      int, int);
528static int unix_seqpacket_sendmsg(struct kiocb *, struct socket *,
529				  struct msghdr *, size_t);
530static int unix_seqpacket_recvmsg(struct kiocb *, struct socket *,
531				  struct msghdr *, size_t, int);
532
533static int unix_set_peek_off(struct sock *sk, int val)
534{
535	struct unix_sock *u = unix_sk(sk);
536
537	if (mutex_lock_interruptible(&u->readlock))
538		return -EINTR;
539
540	sk->sk_peek_off = val;
541	mutex_unlock(&u->readlock);
542
543	return 0;
544}
545
546
547static const struct proto_ops unix_stream_ops = {
548	.family =	PF_UNIX,
549	.owner =	THIS_MODULE,
550	.release =	unix_release,
551	.bind =		unix_bind,
552	.connect =	unix_stream_connect,
553	.socketpair =	unix_socketpair,
554	.accept =	unix_accept,
555	.getname =	unix_getname,
556	.poll =		unix_poll,
557	.ioctl =	unix_ioctl,
558	.listen =	unix_listen,
559	.shutdown =	unix_shutdown,
560	.setsockopt =	sock_no_setsockopt,
561	.getsockopt =	sock_no_getsockopt,
562	.sendmsg =	unix_stream_sendmsg,
563	.recvmsg =	unix_stream_recvmsg,
564	.mmap =		sock_no_mmap,
565	.sendpage =	sock_no_sendpage,
566	.set_peek_off =	unix_set_peek_off,
567};
568
569static const struct proto_ops unix_dgram_ops = {
570	.family =	PF_UNIX,
571	.owner =	THIS_MODULE,
572	.release =	unix_release,
573	.bind =		unix_bind,
574	.connect =	unix_dgram_connect,
575	.socketpair =	unix_socketpair,
576	.accept =	sock_no_accept,
577	.getname =	unix_getname,
578	.poll =		unix_dgram_poll,
579	.ioctl =	unix_ioctl,
580	.listen =	sock_no_listen,
581	.shutdown =	unix_shutdown,
582	.setsockopt =	sock_no_setsockopt,
583	.getsockopt =	sock_no_getsockopt,
584	.sendmsg =	unix_dgram_sendmsg,
585	.recvmsg =	unix_dgram_recvmsg,
586	.mmap =		sock_no_mmap,
587	.sendpage =	sock_no_sendpage,
588	.set_peek_off =	unix_set_peek_off,
589};
590
591static const struct proto_ops unix_seqpacket_ops = {
592	.family =	PF_UNIX,
593	.owner =	THIS_MODULE,
594	.release =	unix_release,
595	.bind =		unix_bind,
596	.connect =	unix_stream_connect,
597	.socketpair =	unix_socketpair,
598	.accept =	unix_accept,
599	.getname =	unix_getname,
600	.poll =		unix_dgram_poll,
601	.ioctl =	unix_ioctl,
602	.listen =	unix_listen,
603	.shutdown =	unix_shutdown,
604	.setsockopt =	sock_no_setsockopt,
605	.getsockopt =	sock_no_getsockopt,
606	.sendmsg =	unix_seqpacket_sendmsg,
607	.recvmsg =	unix_seqpacket_recvmsg,
608	.mmap =		sock_no_mmap,
609	.sendpage =	sock_no_sendpage,
610	.set_peek_off =	unix_set_peek_off,
611};
612
613static struct proto unix_proto = {
614	.name			= "UNIX",
615	.owner			= THIS_MODULE,
616	.obj_size		= sizeof(struct unix_sock),
617};
618
619/*
620 * AF_UNIX sockets do not interact with hardware, hence they
621 * don't trigger interrupts - so it's safe for them to have
622 * bh-unsafe locking for their sk_receive_queue.lock. Split off
623 * this special lock-class by reinitializing the spinlock key:
624 */
625static struct lock_class_key af_unix_sk_receive_queue_lock_key;
626
627static struct sock *unix_create1(struct net *net, struct socket *sock)
628{
629	struct sock *sk = NULL;
630	struct unix_sock *u;
631
632	atomic_long_inc(&unix_nr_socks);
633	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files())
634		goto out;
635
636	sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto);
637	if (!sk)
638		goto out;
639
640	sock_init_data(sock, sk);
641	lockdep_set_class(&sk->sk_receive_queue.lock,
642				&af_unix_sk_receive_queue_lock_key);
643
644	sk->sk_write_space	= unix_write_space;
645	sk->sk_max_ack_backlog	= net->unx.sysctl_max_dgram_qlen;
646	sk->sk_destruct		= unix_sock_destructor;
647	u	  = unix_sk(sk);
648	u->path.dentry = NULL;
649	u->path.mnt = NULL;
650	spin_lock_init(&u->lock);
651	atomic_long_set(&u->inflight, 0);
652	INIT_LIST_HEAD(&u->link);
653	mutex_init(&u->readlock); /* single task reading lock */
654	init_waitqueue_head(&u->peer_wait);
655	unix_insert_socket(unix_sockets_unbound(sk), sk);
656out:
657	if (sk == NULL)
658		atomic_long_dec(&unix_nr_socks);
659	else {
660		local_bh_disable();
661		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
662		local_bh_enable();
663	}
664	return sk;
665}
666
667static int unix_create(struct net *net, struct socket *sock, int protocol,
668		       int kern)
669{
670	if (protocol && protocol != PF_UNIX)
671		return -EPROTONOSUPPORT;
672
673	sock->state = SS_UNCONNECTED;
674
675	switch (sock->type) {
676	case SOCK_STREAM:
677		sock->ops = &unix_stream_ops;
678		break;
679		/*
680		 *	Believe it or not, BSD has AF_UNIX, SOCK_RAW though
681		 *	nothing uses it.
682		 */
683	case SOCK_RAW:
684		sock->type = SOCK_DGRAM;
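		/* fall through: SOCK_RAW is served by the SOCK_DGRAM code */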
685	case SOCK_DGRAM:
686		sock->ops = &unix_dgram_ops;
687		break;
688	case SOCK_SEQPACKET:
689		sock->ops = &unix_seqpacket_ops;
690		break;
691	default:
692		return -ESOCKTNOSUPPORT;
693	}
694
695	return unix_create1(net, sock) ? 0 : -ENOMEM;
696}
697
698static int unix_release(struct socket *sock)
699{
700	struct sock *sk = sock->sk;
701
702	if (!sk)
703		return 0;
704
705	unix_release_sock(sk, 0);
706	sock->sk = NULL;
707
708	return 0;
709}
710
711static int unix_autobind(struct socket *sock)
712{
713	struct sock *sk = sock->sk;
714	struct net *net = sock_net(sk);
715	struct unix_sock *u = unix_sk(sk);
716	static u32 ordernum = 1;
717	struct unix_address *addr;
718	int err;
719	unsigned int retries = 0;
720
721	mutex_lock(&u->readlock);
722
723	err = 0;
724	if (u->addr)
725		goto out;
726
727	err = -ENOMEM;
728	addr = kzalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL);
729	if (!addr)
730		goto out;
731
732	addr->name->sun_family = AF_UNIX;
733	atomic_set(&addr->refcnt, 1);
734
735retry:
736	addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short);
737	addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0));
738
739	spin_lock(&unix_table_lock);
740	ordernum = (ordernum+1)&0xFFFFF;
741
742	if (__unix_find_socket_byname(net, addr->name, addr->len, sock->type,
743				      addr->hash)) {
744		spin_unlock(&unix_table_lock);
745		/*
746		 * __unix_find_socket_byname() may take a long time if many names
747		 * are already in use.
748		 */
749		cond_resched();
750		/* Give up if all names seem to be in use. */
751		if (retries++ == 0xFFFFF) {
752			err = -ENOSPC;
753			kfree(addr);
754			goto out;
755		}
756		goto retry;
757	}
758	addr->hash ^= sk->sk_type;
759
760	__unix_remove_socket(sk);
761	u->addr = addr;
762	__unix_insert_socket(&unix_socket_table[addr->hash], sk);
763	spin_unlock(&unix_table_lock);
764	err = 0;
765
766out:	mutex_unlock(&u->readlock);
767	return err;
768}
769
770static struct sock *unix_find_other(struct net *net,
771				    struct sockaddr_un *sunname, int len,
772				    int type, unsigned int hash, int *error)
773{
774	struct sock *u;
775	struct path path;
776	int err = 0;
777
778	if (sunname->sun_path[0]) {
779		struct inode *inode;
780		err = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path);
781		if (err)
782			goto fail;
783		inode = path.dentry->d_inode;
784		err = inode_permission(inode, MAY_WRITE);
785		if (err)
786			goto put_fail;
787
788		err = -ECONNREFUSED;
789		if (!S_ISSOCK(inode->i_mode))
790			goto put_fail;
791		u = unix_find_socket_byinode(inode);
792		if (!u)
793			goto put_fail;
794
795		if (u->sk_type == type)
796			touch_atime(&path);
797
798		path_put(&path);
799
800		err = -EPROTOTYPE;
801		if (u->sk_type != type) {
802			sock_put(u);
803			goto fail;
804		}
805	} else {
806		err = -ECONNREFUSED;
807		u = unix_find_socket_byname(net, sunname, len, type, hash);
808		if (u) {
809			struct dentry *dentry;
810			dentry = unix_sk(u)->path.dentry;
811			if (dentry)
812				touch_atime(&unix_sk(u)->path);
813		} else
814			goto fail;
815	}
816	return u;
817
818put_fail:
819	path_put(&path);
820fail:
821	*error = err;
822	return NULL;
823}
824
825static int unix_mknod(const char *sun_path, umode_t mode, struct path *res)
826{
827	struct dentry *dentry;
828	struct path path;
829	int err = 0;
830	/*
831	 * Get the parent directory, calculate the hash for last
832	 * component.
833	 */
834	dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0);
835	err = PTR_ERR(dentry);
836	if (IS_ERR(dentry))
837		return err;
838
839	/*
840	 * All right, let's create it.
841	 */
842	err = security_path_mknod(&path, dentry, mode, 0);
843	if (!err) {
844		err = vfs_mknod(path.dentry->d_inode, dentry, mode, 0);
845		if (!err) {
846			res->mnt = mntget(path.mnt);
847			res->dentry = dget(dentry);
848		}
849	}
850	done_path_create(&path, dentry);
851	return err;
852}
853
854static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
855{
856	struct sock *sk = sock->sk;
857	struct net *net = sock_net(sk);
858	struct unix_sock *u = unix_sk(sk);
859	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
860	char *sun_path = sunaddr->sun_path;
861	int err;
862	unsigned int hash;
863	struct unix_address *addr;
864	struct hlist_head *list;
865
866	err = -EINVAL;
867	if (sunaddr->sun_family != AF_UNIX)
868		goto out;
869
870	if (addr_len == sizeof(short)) {
871		err = unix_autobind(sock);
872		goto out;
873	}
874
875	err = unix_mkname(sunaddr, addr_len, &hash);
876	if (err < 0)
877		goto out;
878	addr_len = err;
879
880	mutex_lock(&u->readlock);
881
882	err = -EINVAL;
883	if (u->addr)
884		goto out_up;
885
886	err = -ENOMEM;
887	addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
888	if (!addr)
889		goto out_up;
890
891	memcpy(addr->name, sunaddr, addr_len);
892	addr->len = addr_len;
893	addr->hash = hash ^ sk->sk_type;
894	atomic_set(&addr->refcnt, 1);
895
896	if (sun_path[0]) {
897		struct path path;
898		umode_t mode = S_IFSOCK |
899		       (SOCK_INODE(sock)->i_mode & ~current_umask());
900		err = unix_mknod(sun_path, mode, &path);
901		if (err) {
902			if (err == -EEXIST)
903				err = -EADDRINUSE;
904			unix_release_addr(addr);
905			goto out_up;
906		}
907		addr->hash = UNIX_HASH_SIZE;
908		hash = path.dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1);
909		spin_lock(&unix_table_lock);
910		u->path = path;
911		list = &unix_socket_table[hash];
912	} else {
913		spin_lock(&unix_table_lock);
914		err = -EADDRINUSE;
915		if (__unix_find_socket_byname(net, sunaddr, addr_len,
916					      sk->sk_type, hash)) {
917			unix_release_addr(addr);
918			goto out_unlock;
919		}
920
921		list = &unix_socket_table[addr->hash];
922	}
923
924	err = 0;
925	__unix_remove_socket(sk);
926	u->addr = addr;
927	__unix_insert_socket(list, sk);
928
929out_unlock:
930	spin_unlock(&unix_table_lock);
931out_up:
932	mutex_unlock(&u->readlock);
933out:
934	return err;
935}
936
937static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
938{
939	if (unlikely(sk1 == sk2) || !sk2) {
940		unix_state_lock(sk1);
941		return;
942	}
943	if (sk1 < sk2) {
944		unix_state_lock(sk1);
945		unix_state_lock_nested(sk2);
946	} else {
947		unix_state_lock(sk2);
948		unix_state_lock_nested(sk1);
949	}
950}
951
952static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
953{
954	if (unlikely(sk1 == sk2) || !sk2) {
955		unix_state_unlock(sk1);
956		return;
957	}
958	unix_state_unlock(sk1);
959	unix_state_unlock(sk2);
960}
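/*
 * Locking both sockets in address order above gives every pair a single
 * global lock order, so a sketch like
 *
 *	task A: unix_state_double_lock(a, b);
 *	task B: unix_state_double_lock(b, a);
 *
 * cannot deadlock: both tasks take min(a, b) first, and the loser simply
 * waits.
 */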
961
962static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
963			      int alen, int flags)
964{
965	struct sock *sk = sock->sk;
966	struct net *net = sock_net(sk);
967	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
968	struct sock *other;
969	unsigned int hash;
970	int err;
971
972	if (addr->sa_family != AF_UNSPEC) {
973		err = unix_mkname(sunaddr, alen, &hash);
974		if (err < 0)
975			goto out;
976		alen = err;
977
978		if (test_bit(SOCK_PASSCRED, &sock->flags) &&
979		    !unix_sk(sk)->addr && (err = unix_autobind(sock)) != 0)
980			goto out;
981
982restart:
983		other = unix_find_other(net, sunaddr, alen, sock->type, hash, &err);
984		if (!other)
985			goto out;
986
987		unix_state_double_lock(sk, other);
988
989		/* Apparently VFS overslept socket death. Retry. */
990		if (sock_flag(other, SOCK_DEAD)) {
991			unix_state_double_unlock(sk, other);
992			sock_put(other);
993			goto restart;
994		}
995
996		err = -EPERM;
997		if (!unix_may_send(sk, other))
998			goto out_unlock;
999
1000		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1001		if (err)
1002			goto out_unlock;
1003
1004	} else {
1005		/*
1006		 *	1003.1g breaking connected state with AF_UNSPEC
1007		 */
1008		other = NULL;
1009		unix_state_double_lock(sk, other);
1010	}
1011
1012	/*
1013	 * If it was connected, reconnect.
1014	 */
1015	if (unix_peer(sk)) {
1016		struct sock *old_peer = unix_peer(sk);
1017		unix_peer(sk) = other;
1018		unix_state_double_unlock(sk, other);
1019
1020		if (other != old_peer)
1021			unix_dgram_disconnected(sk, old_peer);
1022		sock_put(old_peer);
1023	} else {
1024		unix_peer(sk) = other;
1025		unix_state_double_unlock(sk, other);
1026	}
1027	return 0;
1028
1029out_unlock:
1030	unix_state_double_unlock(sk, other);
1031	sock_put(other);
1032out:
1033	return err;
1034}
1035
1036static long unix_wait_for_peer(struct sock *other, long timeo)
1037{
1038	struct unix_sock *u = unix_sk(other);
1039	int sched;
1040	DEFINE_WAIT(wait);
1041
1042	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1043
1044	sched = !sock_flag(other, SOCK_DEAD) &&
1045		!(other->sk_shutdown & RCV_SHUTDOWN) &&
1046		unix_recvq_full(other);
1047
1048	unix_state_unlock(other);
1049
1050	if (sched)
1051		timeo = schedule_timeout(timeo);
1052
1053	finish_wait(&u->peer_wait, &wait);
1054	return timeo;
1055}
1056
1057static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1058			       int addr_len, int flags)
1059{
1060	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1061	struct sock *sk = sock->sk;
1062	struct net *net = sock_net(sk);
1063	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1064	struct sock *newsk = NULL;
1065	struct sock *other = NULL;
1066	struct sk_buff *skb = NULL;
1067	unsigned int hash;
1068	int st;
1069	int err;
1070	long timeo;
1071
1072	err = unix_mkname(sunaddr, addr_len, &hash);
1073	if (err < 0)
1074		goto out;
1075	addr_len = err;
1076
1077	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr &&
1078	    (err = unix_autobind(sock)) != 0)
1079		goto out;
1080
1081	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
1082
1083	/* First of all allocate resources.
1084	   If we allocated them after the state was locked,
1085	   we would have to recheck everything again in any case.
1086	 */
1087
1088	err = -ENOMEM;
1089
1090	/* create new sock for complete connection */
1091	newsk = unix_create1(sock_net(sk), NULL);
1092	if (newsk == NULL)
1093		goto out;
1094
1095	/* Allocate skb for sending to listening sock */
1096	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
1097	if (skb == NULL)
1098		goto out;
1099
1100restart:
1101	/*  Find listening sock. */
1102	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, hash, &err);
1103	if (!other)
1104		goto out;
1105
1106	/* Latch state of peer */
1107	unix_state_lock(other);
1108
1109	/* Apparently VFS overslept socket death. Retry. */
1110	if (sock_flag(other, SOCK_DEAD)) {
1111		unix_state_unlock(other);
1112		sock_put(other);
1113		goto restart;
1114	}
1115
1116	err = -ECONNREFUSED;
1117	if (other->sk_state != TCP_LISTEN)
1118		goto out_unlock;
1119	if (other->sk_shutdown & RCV_SHUTDOWN)
1120		goto out_unlock;
1121
1122	if (unix_recvq_full(other)) {
1123		err = -EAGAIN;
1124		if (!timeo)
1125			goto out_unlock;
1126
1127		timeo = unix_wait_for_peer(other, timeo);
1128
1129		err = sock_intr_errno(timeo);
1130		if (signal_pending(current))
1131			goto out;
1132		sock_put(other);
1133		goto restart;
1134	}
1135
1136	/* Latch our state.
1137
1138	   This is a tricky place. We need to grab our state lock and cannot
1139	   drop the lock on the peer. It is dangerous because deadlock is
1140	   possible. The connect-to-self case and simultaneous
1141	   attempts to connect are eliminated by checking the socket
1142	   state: other is TCP_LISTEN, and a TCP_LISTEN sk is rejected
1143	   before we attempt to grab the lock.
1144
1145	   Well, and we have to recheck the state after the socket is locked.
1146	 */
1147	st = sk->sk_state;
1148
1149	switch (st) {
1150	case TCP_CLOSE:
1151		/* This is ok... continue with connect */
1152		break;
1153	case TCP_ESTABLISHED:
1154		/* Socket is already connected */
1155		err = -EISCONN;
1156		goto out_unlock;
1157	default:
1158		err = -EINVAL;
1159		goto out_unlock;
1160	}
1161
1162	unix_state_lock_nested(sk);
1163
1164	if (sk->sk_state != st) {
1165		unix_state_unlock(sk);
1166		unix_state_unlock(other);
1167		sock_put(other);
1168		goto restart;
1169	}
1170
1171	err = security_unix_stream_connect(sk, other, newsk);
1172	if (err) {
1173		unix_state_unlock(sk);
1174		goto out_unlock;
1175	}
1176
1177	/* The way is open! Quickly set all the necessary fields... */
1178
1179	sock_hold(sk);
1180	unix_peer(newsk)	= sk;
1181	newsk->sk_state		= TCP_ESTABLISHED;
1182	newsk->sk_type		= sk->sk_type;
1183	init_peercred(newsk);
1184	newu = unix_sk(newsk);
1185	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1186	otheru = unix_sk(other);
1187
1188	/* copy address information from listening to new sock */
1189	if (otheru->addr) {
1190		atomic_inc(&otheru->addr->refcnt);
1191		newu->addr = otheru->addr;
1192	}
1193	if (otheru->path.dentry) {
1194		path_get(&otheru->path);
1195		newu->path = otheru->path;
1196	}
1197
1198	/* Set credentials */
1199	copy_peercred(sk, other);
1200
1201	sock->state	= SS_CONNECTED;
1202	sk->sk_state	= TCP_ESTABLISHED;
1203	sock_hold(newsk);
1204
1205	smp_mb__after_atomic_inc();	/* sock_hold() does an atomic_inc() */
1206	unix_peer(sk)	= newsk;
1207
1208	unix_state_unlock(sk);
1209
1210	/* queue the skb on the listening sock and notify it */
1211	spin_lock(&other->sk_receive_queue.lock);
1212	__skb_queue_tail(&other->sk_receive_queue, skb);
1213	spin_unlock(&other->sk_receive_queue.lock);
1214	unix_state_unlock(other);
1215	other->sk_data_ready(other, 0);
1216	sock_put(other);
1217	return 0;
1218
1219out_unlock:
1220	if (other)
1221		unix_state_unlock(other);
1222
1223out:
1224	kfree_skb(skb);
1225	if (newsk)
1226		unix_release_sock(newsk, 0);
1227	if (other)
1228		sock_put(other);
1229	return err;
1230}
1231
1232static int unix_socketpair(struct socket *socka, struct socket *sockb)
1233{
1234	struct sock *ska = socka->sk, *skb = sockb->sk;
1235
1236	/* Join our sockets back to back */
1237	sock_hold(ska);
1238	sock_hold(skb);
1239	unix_peer(ska) = skb;
1240	unix_peer(skb) = ska;
1241	init_peercred(ska);
1242	init_peercred(skb);
1243
1244	if (ska->sk_type != SOCK_DGRAM) {
1245		ska->sk_state = TCP_ESTABLISHED;
1246		skb->sk_state = TCP_ESTABLISHED;
1247		socka->state  = SS_CONNECTED;
1248		sockb->state  = SS_CONNECTED;
1249	}
1250	return 0;
1251}
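/*
 * Example: the userspace view of the back-to-back join above (a minimal
 * sketch; error handling omitted):
 *
 *	#include <sys/socket.h>
 *	#include <unistd.h>
 *
 *	int fds[2];
 *	char c;
 *
 *	socketpair(AF_UNIX, SOCK_STREAM, 0, fds);
 *	write(fds[0], "x", 1);	// queued on the peer...
 *	read(fds[1], &c, 1);	// ...and read from the other end
 *	close(fds[0]);
 *	close(fds[1]);
 */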
1252
1253static void unix_sock_inherit_flags(const struct socket *old,
1254				    struct socket *new)
1255{
1256	if (test_bit(SOCK_PASSCRED, &old->flags))
1257		set_bit(SOCK_PASSCRED, &new->flags);
1258	if (test_bit(SOCK_PASSSEC, &old->flags))
1259		set_bit(SOCK_PASSSEC, &new->flags);
1260}
1261
1262static int unix_accept(struct socket *sock, struct socket *newsock, int flags)
1263{
1264	struct sock *sk = sock->sk;
1265	struct sock *tsk;
1266	struct sk_buff *skb;
1267	int err;
1268
1269	err = -EOPNOTSUPP;
1270	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1271		goto out;
1272
1273	err = -EINVAL;
1274	if (sk->sk_state != TCP_LISTEN)
1275		goto out;
1276
1277	/* If socket state is TCP_LISTEN it cannot change (for now...),
1278	 * so no locks are necessary.
1279	 */
1280
1281	skb = skb_recv_datagram(sk, 0, flags&O_NONBLOCK, &err);
1282	if (!skb) {
1283		/* This means receive shutdown. */
1284		if (err == 0)
1285			err = -EINVAL;
1286		goto out;
1287	}
1288
1289	tsk = skb->sk;
1290	skb_free_datagram(sk, skb);
1291	wake_up_interruptible(&unix_sk(sk)->peer_wait);
1292
1293	/* attach accepted sock to socket */
1294	unix_state_lock(tsk);
1295	newsock->state = SS_CONNECTED;
1296	unix_sock_inherit_flags(sock, newsock);
1297	sock_graft(tsk, newsock);
1298	unix_state_unlock(tsk);
1299	return 0;
1300
1301out:
1302	return err;
1303}
1304
1305
1306static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_len, int peer)
1307{
1308	struct sock *sk = sock->sk;
1309	struct unix_sock *u;
1310	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1311	int err = 0;
1312
1313	if (peer) {
1314		sk = unix_peer_get(sk);
1315
1316		err = -ENOTCONN;
1317		if (!sk)
1318			goto out;
1319		err = 0;
1320	} else {
1321		sock_hold(sk);
1322	}
1323
1324	u = unix_sk(sk);
1325	unix_state_lock(sk);
1326	if (!u->addr) {
1327		sunaddr->sun_family = AF_UNIX;
1328		sunaddr->sun_path[0] = 0;
1329		*uaddr_len = sizeof(short);
1330	} else {
1331		struct unix_address *addr = u->addr;
1332
1333		*uaddr_len = addr->len;
1334		memcpy(sunaddr, addr->name, *uaddr_len);
1335	}
1336	unix_state_unlock(sk);
1337	sock_put(sk);
1338out:
1339	return err;
1340}
1341
1342static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1343{
1344	int i;
1345
1346	scm->fp = UNIXCB(skb).fp;
1347	UNIXCB(skb).fp = NULL;
1348
1349	for (i = scm->fp->count-1; i >= 0; i--)
1350		unix_notinflight(scm->fp->fp[i]);
1351}
1352
1353static void unix_destruct_scm(struct sk_buff *skb)
1354{
1355	struct scm_cookie scm;
1356	memset(&scm, 0, sizeof(scm));
1357	scm.pid  = UNIXCB(skb).pid;
1358	if (UNIXCB(skb).fp)
1359		unix_detach_fds(&scm, skb);
1360
1361	/* Alas, it calls VFS */
1362	/* So fscking what? fput() had been SMP-safe since the last Summer */
1363	scm_destroy(&scm);
1364	sock_wfree(skb);
1365}
1366
1367#define MAX_RECURSION_LEVEL 4
1368
1369static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1370{
1371	int i;
1372	unsigned char max_level = 0;
1373	int unix_sock_count = 0;
1374
1375	for (i = scm->fp->count - 1; i >= 0; i--) {
1376		struct sock *sk = unix_get_socket(scm->fp->fp[i]);
1377
1378		if (sk) {
1379			unix_sock_count++;
1380			max_level = max(max_level,
1381					unix_sk(sk)->recursion_level);
1382		}
1383	}
1384	if (unlikely(max_level > MAX_RECURSION_LEVEL))
1385		return -ETOOMANYREFS;
1386
1387	/*
1388	 * Need to duplicate file references for the sake of garbage
1389	 * collection.  Otherwise a socket in the fps might become a
1390	 * candidate for GC while the skb is not yet queued.
1391	 */
1392	UNIXCB(skb).fp = scm_fp_dup(scm->fp);
1393	if (!UNIXCB(skb).fp)
1394		return -ENOMEM;
1395
1396	if (unix_sock_count) {
1397		for (i = scm->fp->count - 1; i >= 0; i--)
1398			unix_inflight(scm->fp->fp[i]);
1399	}
1400	return max_level;
1401}
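/*
 * Example: the userspace side of fd passing that ends up in
 * unix_attach_fds() (a sketch; error handling is omitted and the helper
 * name send_fd is made up):
 *
 *	#include <string.h>
 *	#include <sys/socket.h>
 *
 *	static void send_fd(int sock, int fd)
 *	{
 *		char dummy = '*';
 *		struct iovec iov = { .iov_base = &dummy, .iov_len = 1 };
 *		char buf[CMSG_SPACE(sizeof(int))];
 *		struct msghdr msg = {
 *			.msg_iov = &iov, .msg_iovlen = 1,
 *			.msg_control = buf, .msg_controllen = sizeof(buf),
 *		};
 *		struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
 *
 *		cmsg->cmsg_level = SOL_SOCKET;
 *		cmsg->cmsg_type  = SCM_RIGHTS;
 *		cmsg->cmsg_len   = CMSG_LEN(sizeof(int));
 *		memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
 *		sendmsg(sock, &msg, 0);
 *	}
 */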
1402
1403static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1404{
1405	int err = 0;
1406
1407	UNIXCB(skb).pid  = get_pid(scm->pid);
1408	UNIXCB(skb).uid = scm->creds.uid;
1409	UNIXCB(skb).gid = scm->creds.gid;
1410	UNIXCB(skb).fp = NULL;
1411	if (scm->fp && send_fds)
1412		err = unix_attach_fds(scm, skb);
1413
1414	skb->destructor = unix_destruct_scm;
1415	return err;
1416}
1417
1418/*
1419 * Some apps rely on write() giving SCM_CREDENTIALS.
1420 * We include credentials if the source or destination socket
1421 * asserted SOCK_PASSCRED.
1422 */
1423static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1424			    const struct sock *other)
1425{
1426	if (UNIXCB(skb).pid)
1427		return;
1428	if (test_bit(SOCK_PASSCRED, &sock->flags) ||
1429	    !other->sk_socket ||
1430	    test_bit(SOCK_PASSCRED, &other->sk_socket->flags)) {
1431		UNIXCB(skb).pid  = get_pid(task_tgid(current));
1432		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1433	}
1434}
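/*
 * Example: the receiving side that these credentials serve (a sketch,
 * assuming fd is a userspace AF_UNIX socket; error handling omitted):
 *
 *	#include <sys/socket.h>
 *
 *	int on = 1;
 *	struct ucred cred;
 *	socklen_t len = sizeof(cred);
 *
 *	setsockopt(fd, SOL_SOCKET, SO_PASSCRED, &on, sizeof(on));
 *	// recvmsg() now yields SCM_CREDENTIALS control messages; for a
 *	// connected socket the peer can also be queried directly:
 *	getsockopt(fd, SOL_SOCKET, SO_PEERCRED, &cred, &len);
 */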
1435
1436/*
1437 *	Send AF_UNIX data.
1438 */
1439
1440static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock,
1441			      struct msghdr *msg, size_t len)
1442{
1443	struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
1444	struct sock *sk = sock->sk;
1445	struct net *net = sock_net(sk);
1446	struct unix_sock *u = unix_sk(sk);
1447	struct sockaddr_un *sunaddr = msg->msg_name;
1448	struct sock *other = NULL;
1449	int namelen = 0; /* fake initialization to quiet GCC */
1450	int err;
1451	unsigned int hash;
1452	struct sk_buff *skb;
1453	long timeo;
1454	struct scm_cookie tmp_scm;
1455	int max_level;
1456	int data_len = 0;
1457
1458	if (NULL == siocb->scm)
1459		siocb->scm = &tmp_scm;
1460	wait_for_unix_gc();
1461	err = scm_send(sock, msg, siocb->scm, false);
1462	if (err < 0)
1463		return err;
1464
1465	err = -EOPNOTSUPP;
1466	if (msg->msg_flags&MSG_OOB)
1467		goto out;
1468
1469	if (msg->msg_namelen) {
1470		err = unix_mkname(sunaddr, msg->msg_namelen, &hash);
1471		if (err < 0)
1472			goto out;
1473		namelen = err;
1474	} else {
1475		sunaddr = NULL;
1476		err = -ENOTCONN;
1477		other = unix_peer_get(sk);
1478		if (!other)
1479			goto out;
1480	}
1481
1482	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr
1483	    && (err = unix_autobind(sock)) != 0)
1484		goto out;
1485
1486	err = -EMSGSIZE;
1487	if (len > sk->sk_sndbuf - 32)
1488		goto out;
1489
1490	if (len > SKB_MAX_ALLOC)
1491		data_len = min_t(size_t,
1492				 len - SKB_MAX_ALLOC,
1493				 MAX_SKB_FRAGS * PAGE_SIZE);
1494
1495	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
1496				   msg->msg_flags & MSG_DONTWAIT, &err,
1497				   PAGE_ALLOC_COSTLY_ORDER);
1498	if (skb == NULL)
1499		goto out;
1500
1501	err = unix_scm_to_skb(siocb->scm, skb, true);
1502	if (err < 0)
1503		goto out_free;
1504	max_level = err + 1;
1505	unix_get_secdata(siocb->scm, skb);
1506
1507	skb_put(skb, len - data_len);
1508	skb->data_len = data_len;
1509	skb->len = len;
1510	err = skb_copy_datagram_from_iovec(skb, 0, msg->msg_iov, 0, len);
1511	if (err)
1512		goto out_free;
1513
1514	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1515
1516restart:
1517	if (!other) {
1518		err = -ECONNRESET;
1519		if (sunaddr == NULL)
1520			goto out_free;
1521
1522		other = unix_find_other(net, sunaddr, namelen, sk->sk_type,
1523					hash, &err);
1524		if (other == NULL)
1525			goto out_free;
1526	}
1527
1528	if (sk_filter(other, skb) < 0) {
1529		/* Toss the packet but do not return any error to the sender */
1530		err = len;
1531		goto out_free;
1532	}
1533
1534	unix_state_lock(other);
1535	err = -EPERM;
1536	if (!unix_may_send(sk, other))
1537		goto out_unlock;
1538
1539	if (sock_flag(other, SOCK_DEAD)) {
1540		/*
1541		 *	Check with 1003.1g - what should a
1542		 *	datagram error do here?
1543		 */
1544		unix_state_unlock(other);
1545		sock_put(other);
1546
1547		err = 0;
1548		unix_state_lock(sk);
1549		if (unix_peer(sk) == other) {
1550			unix_peer(sk) = NULL;
1551			unix_state_unlock(sk);
1552
1553			unix_dgram_disconnected(sk, other);
1554			sock_put(other);
1555			err = -ECONNREFUSED;
1556		} else {
1557			unix_state_unlock(sk);
1558		}
1559
1560		other = NULL;
1561		if (err)
1562			goto out_free;
1563		goto restart;
1564	}
1565
1566	err = -EPIPE;
1567	if (other->sk_shutdown & RCV_SHUTDOWN)
1568		goto out_unlock;
1569
1570	if (sk->sk_type != SOCK_SEQPACKET) {
1571		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1572		if (err)
1573			goto out_unlock;
1574	}
1575
1576	if (unix_peer(other) != sk && unix_recvq_full(other)) {
1577		if (!timeo) {
1578			err = -EAGAIN;
1579			goto out_unlock;
1580		}
1581
1582		timeo = unix_wait_for_peer(other, timeo);
1583
1584		err = sock_intr_errno(timeo);
1585		if (signal_pending(current))
1586			goto out_free;
1587
1588		goto restart;
1589	}
1590
1591	if (sock_flag(other, SOCK_RCVTSTAMP))
1592		__net_timestamp(skb);
1593	maybe_add_creds(skb, sock, other);
1594	skb_queue_tail(&other->sk_receive_queue, skb);
1595	if (max_level > unix_sk(other)->recursion_level)
1596		unix_sk(other)->recursion_level = max_level;
1597	unix_state_unlock(other);
1598	other->sk_data_ready(other, len);
1599	sock_put(other);
1600	scm_destroy(siocb->scm);
1601	return len;
1602
1603out_unlock:
1604	unix_state_unlock(other);
1605out_free:
1606	kfree_skb(skb);
1607out:
1608	if (other)
1609		sock_put(other);
1610	scm_destroy(siocb->scm);
1611	return err;
1612}
1613
1614/* We use paged skbs for stream sockets, and limit occupancy to 32768
1615 * bytes, and a minimum of a full page.
1616 */
1617#define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
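/*
 * Worked example: with 4 KiB pages, get_order(32768) == 3, so
 * UNIX_SKB_FRAGS_SZ == 4096 << 3 == 32768 bytes; with 64 KiB pages the
 * order is 0 and the limit becomes a single 64 KiB page.
 */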
1618
1619static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
1620			       struct msghdr *msg, size_t len)
1621{
1622	struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
1623	struct sock *sk = sock->sk;
1624	struct sock *other = NULL;
1625	int err, size;
1626	struct sk_buff *skb;
1627	int sent = 0;
1628	struct scm_cookie tmp_scm;
1629	bool fds_sent = false;
1630	int max_level;
1631	int data_len;
1632
1633	if (NULL == siocb->scm)
1634		siocb->scm = &tmp_scm;
1635	wait_for_unix_gc();
1636	err = scm_send(sock, msg, siocb->scm, false);
1637	if (err < 0)
1638		return err;
1639
1640	err = -EOPNOTSUPP;
1641	if (msg->msg_flags&MSG_OOB)
1642		goto out_err;
1643
1644	if (msg->msg_namelen) {
1645		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
1646		goto out_err;
1647	} else {
1648		err = -ENOTCONN;
1649		other = unix_peer(sk);
1650		if (!other)
1651			goto out_err;
1652	}
1653
1654	if (sk->sk_shutdown & SEND_SHUTDOWN)
1655		goto pipe_err;
1656
1657	while (sent < len) {
1658		size = len - sent;
1659
1660		/* Keep two messages in the pipe so it schedules better */
1661		size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);
1662
1663		/* allow fallback to order-0 allocations */
1664		size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
1665
1666		data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
1667
1668		skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
1669					   msg->msg_flags & MSG_DONTWAIT, &err,
1670					   get_order(UNIX_SKB_FRAGS_SZ));
1671		if (!skb)
1672			goto out_err;
1673
1674		/* Only send the fds in the first buffer */
1675		err = unix_scm_to_skb(siocb->scm, skb, !fds_sent);
1676		if (err < 0) {
1677			kfree_skb(skb);
1678			goto out_err;
1679		}
1680		max_level = err + 1;
1681		fds_sent = true;
1682
1683		skb_put(skb, size - data_len);
1684		skb->data_len = data_len;
1685		skb->len = size;
1686		err = skb_copy_datagram_from_iovec(skb, 0, msg->msg_iov,
1687						   sent, size);
1688		if (err) {
1689			kfree_skb(skb);
1690			goto out_err;
1691		}
1692
1693		unix_state_lock(other);
1694
1695		if (sock_flag(other, SOCK_DEAD) ||
1696		    (other->sk_shutdown & RCV_SHUTDOWN))
1697			goto pipe_err_free;
1698
1699		maybe_add_creds(skb, sock, other);
1700		skb_queue_tail(&other->sk_receive_queue, skb);
1701		if (max_level > unix_sk(other)->recursion_level)
1702			unix_sk(other)->recursion_level = max_level;
1703		unix_state_unlock(other);
1704		other->sk_data_ready(other, size);
1705		sent += size;
1706	}
1707
1708	scm_destroy(siocb->scm);
1709	siocb->scm = NULL;
1710
1711	return sent;
1712
1713pipe_err_free:
1714	unix_state_unlock(other);
1715	kfree_skb(skb);
1716pipe_err:
1717	if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
1718		send_sig(SIGPIPE, current, 0);
1719	err = -EPIPE;
1720out_err:
1721	scm_destroy(siocb->scm);
1722	siocb->scm = NULL;
1723	return sent ? : err;
1724}
1725
1726static int unix_seqpacket_sendmsg(struct kiocb *kiocb, struct socket *sock,
1727				  struct msghdr *msg, size_t len)
1728{
1729	int err;
1730	struct sock *sk = sock->sk;
1731
1732	err = sock_error(sk);
1733	if (err)
1734		return err;
1735
1736	if (sk->sk_state != TCP_ESTABLISHED)
1737		return -ENOTCONN;
1738
1739	if (msg->msg_namelen)
1740		msg->msg_namelen = 0;
1741
1742	return unix_dgram_sendmsg(kiocb, sock, msg, len);
1743}
1744
1745static int unix_seqpacket_recvmsg(struct kiocb *iocb, struct socket *sock,
1746			      struct msghdr *msg, size_t size,
1747			      int flags)
1748{
1749	struct sock *sk = sock->sk;
1750
1751	if (sk->sk_state != TCP_ESTABLISHED)
1752		return -ENOTCONN;
1753
1754	return unix_dgram_recvmsg(iocb, sock, msg, size, flags);
1755}
1756
1757static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
1758{
1759	struct unix_sock *u = unix_sk(sk);
1760
1761	if (u->addr) {
1762		msg->msg_namelen = u->addr->len;
1763		memcpy(msg->msg_name, u->addr->name, u->addr->len);
1764	}
1765}
1766
1767static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
1768			      struct msghdr *msg, size_t size,
1769			      int flags)
1770{
1771	struct sock_iocb *siocb = kiocb_to_siocb(iocb);
1772	struct scm_cookie tmp_scm;
1773	struct sock *sk = sock->sk;
1774	struct unix_sock *u = unix_sk(sk);
1775	int noblock = flags & MSG_DONTWAIT;
1776	struct sk_buff *skb;
1777	int err;
1778	int peeked, skip;
1779
1780	err = -EOPNOTSUPP;
1781	if (flags&MSG_OOB)
1782		goto out;
1783
1784	err = mutex_lock_interruptible(&u->readlock);
1785	if (err) {
1786		err = sock_intr_errno(sock_rcvtimeo(sk, noblock));
1787		goto out;
1788	}
1789
1790	skip = sk_peek_offset(sk, flags);
1791
1792	skb = __skb_recv_datagram(sk, flags, &peeked, &skip, &err);
1793	if (!skb) {
1794		unix_state_lock(sk);
1795		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
1796		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
1797		    (sk->sk_shutdown & RCV_SHUTDOWN))
1798			err = 0;
1799		unix_state_unlock(sk);
1800		goto out_unlock;
1801	}
1802
1803	wake_up_interruptible_sync_poll(&u->peer_wait,
1804					POLLOUT | POLLWRNORM | POLLWRBAND);
1805
1806	if (msg->msg_name)
1807		unix_copy_addr(msg, skb->sk);
1808
1809	if (size > skb->len - skip)
1810		size = skb->len - skip;
1811	else if (size < skb->len - skip)
1812		msg->msg_flags |= MSG_TRUNC;
1813
1814	err = skb_copy_datagram_iovec(skb, skip, msg->msg_iov, size);
1815	if (err)
1816		goto out_free;
1817
1818	if (sock_flag(sk, SOCK_RCVTSTAMP))
1819		__sock_recv_timestamp(msg, sk, skb);
1820
1821	if (!siocb->scm) {
1822		siocb->scm = &tmp_scm;
1823		memset(&tmp_scm, 0, sizeof(tmp_scm));
1824	}
1825	scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
1826	unix_set_secdata(siocb->scm, skb);
1827
1828	if (!(flags & MSG_PEEK)) {
1829		if (UNIXCB(skb).fp)
1830			unix_detach_fds(siocb->scm, skb);
1831
1832		sk_peek_offset_bwd(sk, skb->len);
1833	} else {
1834		/* It is questionable: on PEEK we could:
1835		   - do not return fds - good, but too simple 8)
1836		   - return fds, and do not return them on read (old strategy,
1837		     apparently wrong)
1838		   - clone fds (I chose it for now, it is the most universal
1839		     solution)
1840
1841		   POSIX 1003.1g does not actually define this clearly
1842		   at all. POSIX 1003.1g doesn't define a lot of things
1843		   clearly however!
1844
1845		*/
1846
1847		sk_peek_offset_fwd(sk, size);
1848
1849		if (UNIXCB(skb).fp)
1850			siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1851	}
1852	err = (flags & MSG_TRUNC) ? skb->len - skip : size;
1853
1854	scm_recv(sock, msg, siocb->scm, flags);
1855
1856out_free:
1857	skb_free_datagram(sk, skb);
1858out_unlock:
1859	mutex_unlock(&u->readlock);
1860out:
1861	return err;
1862}
1863
1864/*
1865 *	Sleep until more data has arrived. But check for races.
1866 */
1867static long unix_stream_data_wait(struct sock *sk, long timeo,
1868				  struct sk_buff *last)
1869{
1870	DEFINE_WAIT(wait);
1871
1872	unix_state_lock(sk);
1873
1874	for (;;) {
1875		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1876
1877		if (skb_peek_tail(&sk->sk_receive_queue) != last ||
1878		    sk->sk_err ||
1879		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
1880		    signal_pending(current) ||
1881		    !timeo)
1882			break;
1883
1884		set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1885		unix_state_unlock(sk);
1886		timeo = freezable_schedule_timeout(timeo);
1887		unix_state_lock(sk);
1888		clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1889	}
1890
1891	finish_wait(sk_sleep(sk), &wait);
1892	unix_state_unlock(sk);
1893	return timeo;
1894}
1895
1896static unsigned int unix_skb_len(const struct sk_buff *skb)
1897{
1898	return skb->len - UNIXCB(skb).consumed;
1899}
1900
1901static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
1902			       struct msghdr *msg, size_t size,
1903			       int flags)
1904{
1905	struct sock_iocb *siocb = kiocb_to_siocb(iocb);
1906	struct scm_cookie tmp_scm;
1907	struct sock *sk = sock->sk;
1908	struct unix_sock *u = unix_sk(sk);
1909	struct sockaddr_un *sunaddr = msg->msg_name;
1910	int copied = 0;
1911	int check_creds = 0;
1912	int target;
1913	int err = 0;
1914	long timeo;
1915	int skip;
1916
1917	err = -EINVAL;
1918	if (sk->sk_state != TCP_ESTABLISHED)
1919		goto out;
1920
1921	err = -EOPNOTSUPP;
1922	if (flags&MSG_OOB)
1923		goto out;
1924
1925	target = sock_rcvlowat(sk, flags&MSG_WAITALL, size);
1926	timeo = sock_rcvtimeo(sk, flags&MSG_DONTWAIT);
1927
1928	/* Lock the socket to prevent queue disordering
1929	 * while we sleep in memcpy_tomsg
1930	 */
1931
1932	if (!siocb->scm) {
1933		siocb->scm = &tmp_scm;
1934		memset(&tmp_scm, 0, sizeof(tmp_scm));
1935	}
1936
1937	err = mutex_lock_interruptible(&u->readlock);
1938	if (err) {
1939		err = sock_intr_errno(timeo);
1940		goto out;
1941	}
1942
1943	do {
1944		int chunk;
1945		struct sk_buff *skb, *last;
1946
1947		unix_state_lock(sk);
1948		last = skb = skb_peek(&sk->sk_receive_queue);
1949again:
1950		if (skb == NULL) {
1951			unix_sk(sk)->recursion_level = 0;
1952			if (copied >= target)
1953				goto unlock;
1954
1955			/*
1956			 *	POSIX 1003.1g mandates this order.
1957			 */
1958
1959			err = sock_error(sk);
1960			if (err)
1961				goto unlock;
1962			if (sk->sk_shutdown & RCV_SHUTDOWN)
1963				goto unlock;
1964
1965			unix_state_unlock(sk);
1966			err = -EAGAIN;
1967			if (!timeo)
1968				break;
1969			mutex_unlock(&u->readlock);
1970
1971			timeo = unix_stream_data_wait(sk, timeo, last);
1972
1973			if (signal_pending(current)
1974			    ||  mutex_lock_interruptible(&u->readlock)) {
1975				err = sock_intr_errno(timeo);
1976				goto out;
1977			}
1978
1979			continue;
1980 unlock:
1981			unix_state_unlock(sk);
1982			break;
1983		}
1984
1985		skip = sk_peek_offset(sk, flags);
1986		while (skip >= unix_skb_len(skb)) {
1987			skip -= unix_skb_len(skb);
1988			last = skb;
1989			skb = skb_peek_next(skb, &sk->sk_receive_queue);
1990			if (!skb)
1991				goto again;
1992		}
1993
1994		unix_state_unlock(sk);
1995
1996		if (check_creds) {
1997			/* Never glue messages from different writers */
1998			if ((UNIXCB(skb).pid  != siocb->scm->pid) ||
1999			    !uid_eq(UNIXCB(skb).uid, siocb->scm->creds.uid) ||
2000			    !gid_eq(UNIXCB(skb).gid, siocb->scm->creds.gid))
2001				break;
2002		} else if (test_bit(SOCK_PASSCRED, &sock->flags)) {
2003			/* Copy credentials */
2004			scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2005			check_creds = 1;
2006		}
2007
2008		/* Copy address just once */
2009		if (sunaddr) {
2010			unix_copy_addr(msg, skb->sk);
2011			sunaddr = NULL;
2012		}
2013
2014		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2015		if (skb_copy_datagram_iovec(skb, UNIXCB(skb).consumed + skip,
2016					    msg->msg_iov, chunk)) {
2017			if (copied == 0)
2018				copied = -EFAULT;
2019			break;
2020		}
2021		copied += chunk;
2022		size -= chunk;
2023
2024		/* Mark read part of skb as used */
2025		if (!(flags & MSG_PEEK)) {
2026			UNIXCB(skb).consumed += chunk;
2027
2028			sk_peek_offset_bwd(sk, chunk);
2029
2030			if (UNIXCB(skb).fp)
2031				unix_detach_fds(siocb->scm, skb);
2032
2033			if (unix_skb_len(skb))
2034				break;
2035
2036			skb_unlink(skb, &sk->sk_receive_queue);
2037			consume_skb(skb);
2038
2039			if (siocb->scm->fp)
2040				break;
2041		} else {
2042			/* It is questionable, see note in unix_dgram_recvmsg.
2043			 */
2044			if (UNIXCB(skb).fp)
2045				siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);
2046
2047			sk_peek_offset_fwd(sk, chunk);
2048
2049			break;
2050		}
2051	} while (size);
2052
2053	mutex_unlock(&u->readlock);
2054	scm_recv(sock, msg, siocb->scm, flags);
2055out:
2056	return copied ? : err;
2057}
2058
2059static int unix_shutdown(struct socket *sock, int mode)
2060{
2061	struct sock *sk = sock->sk;
2062	struct sock *other;
2063
2064	if (mode < SHUT_RD || mode > SHUT_RDWR)
2065		return -EINVAL;
2066	/* This maps:
2067	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
2068	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
2069	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
2070	 */
2071	++mode;
2072
2073	unix_state_lock(sk);
2074	sk->sk_shutdown |= mode;
2075	other = unix_peer(sk);
2076	if (other)
2077		sock_hold(other);
2078	unix_state_unlock(sk);
2079	sk->sk_state_change(sk);
2080
2081	if (other &&
2082		(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
2083
2084		int peer_mode = 0;
2085
2086		if (mode & RCV_SHUTDOWN)
2087			peer_mode |= SEND_SHUTDOWN;
2088		if (mode & SEND_SHUTDOWN)
2089			peer_mode |= RCV_SHUTDOWN;
2090		unix_state_lock(other);
2091		other->sk_shutdown |= peer_mode;
2092		unix_state_unlock(other);
2093		other->sk_state_change(other);
2094		if (peer_mode == SHUTDOWN_MASK)
2095			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
2096		else if (peer_mode & RCV_SHUTDOWN)
2097			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
2098	}
2099	if (other)
2100		sock_put(other);
2101
2102	return 0;
2103}
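
/*
 * Because the peer's shutdown bits are mirrored above, SHUT_WR on one
 * end of a stream pair is observed as end-of-file (RCV_SHUTDOWN) on
 * the other.  A minimal userspace sketch (error checks elided):
 *
 *	int sv[2];
 *	char buf[16];
 *
 *	socketpair(AF_UNIX, SOCK_STREAM, 0, sv);
 *	shutdown(sv[0], SHUT_WR);	// peer gets RCV_SHUTDOWN + POLL_IN wakeup
 *	read(sv[1], buf, sizeof(buf));	// returns 0: EOF
 *	write(sv[0], "x", 1);		// fails: EPIPE (and SIGPIPE)
 */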
2104
2105long unix_inq_len(struct sock *sk)
2106{
2107	struct sk_buff *skb;
2108	long amount = 0;
2109
2110	if (sk->sk_state == TCP_LISTEN)
2111		return -EINVAL;
2112
2113	spin_lock(&sk->sk_receive_queue.lock);
2114	if (sk->sk_type == SOCK_STREAM ||
2115	    sk->sk_type == SOCK_SEQPACKET) {
2116		skb_queue_walk(&sk->sk_receive_queue, skb)
2117			amount += unix_skb_len(skb);
2118	} else {
2119		skb = skb_peek(&sk->sk_receive_queue);
2120		if (skb)
2121			amount = skb->len;
2122	}
2123	spin_unlock(&sk->sk_receive_queue.lock);
2124
2125	return amount;
2126}
2127EXPORT_SYMBOL_GPL(unix_inq_len);
2128
2129long unix_outq_len(struct sock *sk)
2130{
2131	return sk_wmem_alloc_get(sk);
2132}
2133EXPORT_SYMBOL_GPL(unix_outq_len);
2134
2135static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2136{
2137	struct sock *sk = sock->sk;
2138	long amount = 0;
2139	int err;
2140
2141	switch (cmd) {
2142	case SIOCOUTQ:
2143		amount = unix_outq_len(sk);
2144		err = put_user(amount, (int __user *)arg);
2145		break;
2146	case SIOCINQ:
2147		amount = unix_inq_len(sk);
2148		if (amount < 0)
2149			err = amount;
2150		else
2151			err = put_user(amount, (int __user *)arg);
2152		break;
2153	default:
2154		err = -ENOIOCTLCMD;
2155		break;
2156	}
2157	return err;
2158}
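
/*
 * SIOCINQ and SIOCOUTQ (from <linux/sockios.h>) expose these queue
 * lengths to userspace: SIOCINQ is the byte count computed by
 * unix_inq_len() above (the whole receive queue for stream/seqpacket
 * sockets, only the head datagram otherwise), while SIOCOUTQ reports
 * sk_wmem_alloc, i.e. the send-buffer memory still pinned by skbs the
 * receiver has not consumed.  Illustrative use:
 *
 *	int pending, unsent;
 *
 *	ioctl(fd, SIOCINQ, &pending);	// bytes waiting to be read
 *	ioctl(fd, SIOCOUTQ, &unsent);	// buffer space still in flight
 */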
2159
2160static unsigned int unix_poll(struct file *file, struct socket *sock, poll_table *wait)
2161{
2162	struct sock *sk = sock->sk;
2163	unsigned int mask;
2164
2165	sock_poll_wait(file, sk_sleep(sk), wait);
2166	mask = 0;
2167
2168	/* exceptional events? */
2169	if (sk->sk_err)
2170		mask |= POLLERR;
2171	if (sk->sk_shutdown == SHUTDOWN_MASK)
2172		mask |= POLLHUP;
2173	if (sk->sk_shutdown & RCV_SHUTDOWN)
2174		mask |= POLLRDHUP | POLLIN | POLLRDNORM;
2175
2176	/* readable? */
2177	if (!skb_queue_empty(&sk->sk_receive_queue))
2178		mask |= POLLIN | POLLRDNORM;
2179
2180	/* Connection-based sockets need to check for termination and startup */
2181	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
2182	    sk->sk_state == TCP_CLOSE)
2183		mask |= POLLHUP;
2184
2185	/*
2186	 * We also report the socket as writable when the other side has
2187	 * shut down the connection; this prevents sockets from getting stuck.
2188	 */
2189	if (unix_writable(sk))
2190		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
2191
2192	return mask;
2193}
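
/*
 * The mask above lets a reader tell "peer half-closed" apart from
 * plain readability by asking for POLLRDHUP.  Sketch (POLLRDHUP needs
 * _GNU_SOURCE in glibc; error handling elided):
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLRDHUP };
 *
 *	poll(&pfd, 1, -1);
 *	if (pfd.revents & POLLRDHUP) {
 *		// peer did shutdown(SHUT_WR) or close(): drain, then close
 *	}
 */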
2194
2195static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
2196				    poll_table *wait)
2197{
2198	struct sock *sk = sock->sk, *other;
2199	unsigned int mask, writable;
2200
2201	sock_poll_wait(file, sk_sleep(sk), wait);
2202	mask = 0;
2203
2204	/* exceptional events? */
2205	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
2206		mask |= POLLERR |
2207			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? POLLPRI : 0);
2208
2209	if (sk->sk_shutdown & RCV_SHUTDOWN)
2210		mask |= POLLRDHUP | POLLIN | POLLRDNORM;
2211	if (sk->sk_shutdown == SHUTDOWN_MASK)
2212		mask |= POLLHUP;
2213
2214	/* readable? */
2215	if (!skb_queue_empty(&sk->sk_receive_queue))
2216		mask |= POLLIN | POLLRDNORM;
2217
2218	/* Connection-based sockets need to check for termination and startup */
2219	if (sk->sk_type == SOCK_SEQPACKET) {
2220		if (sk->sk_state == TCP_CLOSE)
2221			mask |= POLLHUP;
2222		/* connection hasn't started yet? */
2223		if (sk->sk_state == TCP_SYN_SENT)
2224			return mask;
2225	}
2226
2227	/* No write status requested, avoid expensive OUT tests. */
2228	if (!(poll_requested_events(wait) & (POLLWRBAND|POLLWRNORM|POLLOUT)))
2229		return mask;
2230
2231	writable = unix_writable(sk);
2232	other = unix_peer_get(sk);
2233	if (other) {
2234		if (unix_peer(other) != sk) {
2235			sock_poll_wait(file, &unix_sk(other)->peer_wait, wait);
2236			if (unix_recvq_full(other))
2237				writable = 0;
2238		}
2239		sock_put(other);
2240	}
2241
2242	if (writable)
2243		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
2244	else
2245		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
2246
2247	return mask;
2248}
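
/*
 * For a connected datagram sender the writability test above also
 * consults the receiver's queue (unix_recvq_full), so poll() gives
 * real back-pressure.  A hedged sketch of a writer that waits for the
 * peer to drain:
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLOUT };
 *
 *	while (send(fd, msg, len, MSG_DONTWAIT) < 0 && errno == EAGAIN)
 *		poll(&pfd, 1, -1);	// sleeps on the peer's queue
 */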
2249
2250#ifdef CONFIG_PROC_FS
2251
2252#define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
2253
2254#define get_bucket(x) ((x) >> BUCKET_SPACE)
2255#define get_offset(x) ((x) & ((1L << BUCKET_SPACE) - 1))
2256#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
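
/*
 * A seq_file position thus encodes a (bucket, offset) pair in a single
 * loff_t: the high bits select the hash bucket, the low BUCKET_SPACE
 * bits count sockets within it.  For example, on 64-bit with
 * UNIX_HASH_BITS == 8 this gives BUCKET_SPACE = 64 - 9 - 1 = 54, so
 * set_bucket_offset(3, 5) == (3UL << 54) | 5, from which get_bucket()
 * and get_offset() recover 3 and 5 again.
 */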
2257
2258static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
2259{
2260	unsigned long offset = get_offset(*pos);
2261	unsigned long bucket = get_bucket(*pos);
2262	struct sock *sk;
2263	unsigned long count = 0;
2264
2265	for (sk = sk_head(&unix_socket_table[bucket]); sk; sk = sk_next(sk)) {
2266		if (sock_net(sk) != seq_file_net(seq))
2267			continue;
2268		if (++count == offset)
2269			break;
2270	}
2271
2272	return sk;
2273}
2274
2275static struct sock *unix_next_socket(struct seq_file *seq,
2276				     struct sock *sk,
2277				     loff_t *pos)
2278{
2279	unsigned long bucket;
2280
2281	while (sk > (struct sock *)SEQ_START_TOKEN) {
2282		sk = sk_next(sk);
2283		if (!sk)
2284			goto next_bucket;
2285		if (sock_net(sk) == seq_file_net(seq))
2286			return sk;
2287	}
2288
2289	do {
2290		sk = unix_from_bucket(seq, pos);
2291		if (sk)
2292			return sk;
2293
2294next_bucket:
2295		bucket = get_bucket(*pos) + 1;
2296		*pos = set_bucket_offset(bucket, 1);
2297	} while (bucket < ARRAY_SIZE(unix_socket_table));
2298
2299	return NULL;
2300}
2301
2302static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
2303	__acquires(unix_table_lock)
2304{
2305	spin_lock(&unix_table_lock);
2306
2307	if (!*pos)
2308		return SEQ_START_TOKEN;
2309
2310	if (get_bucket(*pos) >= ARRAY_SIZE(unix_socket_table))
2311		return NULL;
2312
2313	return unix_next_socket(seq, NULL, pos);
2314}
2315
2316static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2317{
2318	++*pos;
2319	return unix_next_socket(seq, v, pos);
2320}
2321
2322static void unix_seq_stop(struct seq_file *seq, void *v)
2323	__releases(unix_table_lock)
2324{
2325	spin_unlock(&unix_table_lock);
2326}
2327
2328static int unix_seq_show(struct seq_file *seq, void *v)
2329{
2330
2331	if (v == SEQ_START_TOKEN)
2332		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
2333			 "Inode Path\n");
2334	else {
2335		struct sock *s = v;
2336		struct unix_sock *u = unix_sk(s);
2337		unix_state_lock(s);
2338
2339		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
2340			s,
2341			atomic_read(&s->sk_refcnt),
2342			0,
2343			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
2344			s->sk_type,
2345			s->sk_socket ?
2346			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
2347			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
2348			sock_i_ino(s));
2349
2350		if (u->addr) {
2351			int i, len;
2352			seq_putc(seq, ' ');
2353
2354			i = 0;
2355			len = u->addr->len - sizeof(short);
2356			if (!UNIX_ABSTRACT(s))
2357				len--;
2358			else {
2359				seq_putc(seq, '@');
2360				i++;
2361			}
2362			for ( ; i < len; i++)
2363				seq_putc(seq, u->addr->name->sun_path[i]);
2364		}
2365		unix_state_unlock(s);
2366		seq_putc(seq, '\n');
2367	}
2368
2369	return 0;
2370}
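
/*
 * Userspace can walk this table through /proc/net/unix.  A hedged
 * sketch of a reader whose field layout follows the seq_printf()
 * above (the kernel address column is typically zeroed by %pK):
 *
 *	FILE *f = fopen("/proc/net/unix", "r");
 *	char line[512];
 *
 *	fgets(line, sizeof(line), f);		// skip the header
 *	while (fgets(line, sizeof(line), f)) {
 *		unsigned int type, state;
 *		unsigned long inode;
 *		char path[256] = "";
 *
 *		sscanf(line, "%*x: %*x %*x %*x %x %x %lu %255s",
 *		       &type, &state, &inode, path);
 *		// type 0001 == SOCK_STREAM; a leading '@' marks an
 *		// abstract name; path may be empty for unbound sockets
 *	}
 *	fclose(f);
 */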
2371
2372static const struct seq_operations unix_seq_ops = {
2373	.start  = unix_seq_start,
2374	.next   = unix_seq_next,
2375	.stop   = unix_seq_stop,
2376	.show   = unix_seq_show,
2377};
2378
2379static int unix_seq_open(struct inode *inode, struct file *file)
2380{
2381	return seq_open_net(inode, file, &unix_seq_ops,
2382			    sizeof(struct seq_net_private));
2383}
2384
2385static const struct file_operations unix_seq_fops = {
2386	.owner		= THIS_MODULE,
2387	.open		= unix_seq_open,
2388	.read		= seq_read,
2389	.llseek		= seq_lseek,
2390	.release	= seq_release_net,
2391};
2392
2393#endif
2394
2395static const struct net_proto_family unix_family_ops = {
2396	.family = PF_UNIX,
2397	.create = unix_create,
2398	.owner	= THIS_MODULE,
2399};
2400
2401
2402static int __net_init unix_net_init(struct net *net)
2403{
2404	int error = -ENOMEM;
2405
2406	net->unx.sysctl_max_dgram_qlen = 10;
2407	if (unix_sysctl_register(net))
2408		goto out;
2409
2410#ifdef CONFIG_PROC_FS
2411	if (!proc_create("unix", 0, net->proc_net, &unix_seq_fops)) {
2412		unix_sysctl_unregister(net);
2413		goto out;
2414	}
2415#endif
2416	error = 0;
2417out:
2418	return error;
2419}
2420
2421static void __net_exit unix_net_exit(struct net *net)
2422{
2423	unix_sysctl_unregister(net);
2424	remove_proc_entry("unix", net->proc_net);
2425}
2426
2427static struct pernet_operations unix_net_ops = {
2428	.init = unix_net_init,
2429	.exit = unix_net_exit,
2430};
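
/*
 * Each network namespace thus gets its own /proc/net/unix and its own
 * net.unix.max_dgram_qlen sysctl (defaulting to 10 above), bounding
 * how many datagrams may sit unread in a receiving socket's queue.
 * Sketch of inspecting the limit from userspace:
 *
 *	FILE *f = fopen("/proc/sys/net/unix/max_dgram_qlen", "r");
 *	int qlen;
 *
 *	fscanf(f, "%d", &qlen);	// 10 unless the admin has tuned it
 *	fclose(f);
 */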
2431
2432static int __init af_unix_init(void)
2433{
2434	int rc;
2435
2436	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > FIELD_SIZEOF(struct sk_buff, cb));
2437
2438	rc = proto_register(&unix_proto, 1);
2439	if (rc != 0) {
2440		printk(KERN_CRIT "%s: Cannot create unix_sock SLAB cache!\n",
2441		       __func__);
2442		goto out;
2443	}
2444
2445	sock_register(&unix_family_ops);
2446	register_pernet_subsys(&unix_net_ops);
2447out:
2448	return rc;
2449}
2450
2451static void __exit af_unix_exit(void)
2452{
2453	sock_unregister(PF_UNIX);
2454	proto_unregister(&unix_proto);
2455	unregister_pernet_subsys(&unix_net_ops);
2456}
2457
2458/* Earlier than device_initcall() so that other drivers invoking
2459   request_module() don't end up in a loop when modprobe tries
2460   to use a UNIX socket. But later than subsys_initcall() because
2461   we depend on infrastructure initialised there. */
2462fs_initcall(af_unix_init);
2463module_exit(af_unix_exit);
2464
2465MODULE_LICENSE("GPL");
2466MODULE_ALIAS_NETPROTO(PF_UNIX);
2467