af_unix.c revision fc0d753641f7b919c7273d9bd21ae6ab45e757f3
1/*
2 * NET4:	Implementation of BSD Unix domain sockets.
3 *
4 * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
5 *
6 *		This program is free software; you can redistribute it and/or
7 *		modify it under the terms of the GNU General Public License
8 *		as published by the Free Software Foundation; either version
9 *		2 of the License, or (at your option) any later version.
10 *
11 * Fixes:
12 *		Linus Torvalds	:	Assorted bug cures.
13 *		Niibe Yutaka	:	async I/O support.
14 *		Carsten Paeth	:	PF_UNIX check, address fixes.
15 *		Alan Cox	:	Limit size of allocated blocks.
16 *		Alan Cox	:	Fixed the stupid socketpair bug.
17 *		Alan Cox	:	BSD compatibility fine tuning.
18 *		Alan Cox	:	Fixed a bug in connect when interrupted.
19 *		Alan Cox	:	Sorted out a proper draft version of
20 *					file descriptor passing hacked up from
21 *					Mike Shaver's work.
22 *		Marty Leisner	:	Fixes to fd passing
23 *		Nick Nevin	:	recvmsg bugfix.
24 *		Alan Cox	:	Started proper garbage collector
25 *		Heiko EiBfeldt	:	Missing verify_area check
26 *		Alan Cox	:	Started POSIXisms
27 *		Andreas Schwab	:	Replace inode by dentry for proper
28 *					reference counting
29 *		Kirk Petersen	:	Made this a module
30 *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
31 *					Lots of bug fixes.
32 *	     Alexey Kuznetosv	:	Repaired (I hope) bugs introduces
33 *					by above two patches.
34 *	     Andrea Arcangeli	:	If possible we block in connect(2)
35 *					if the max backlog of the listen socket
36 *					is been reached. This won't break
37 *					old apps and it will avoid huge amount
38 *					of socks hashed (this for unix_gc()
39 *					performances reasons).
40 *					Security fix that limits the max
41 *					number of socks to 2*max_files and
42 *					the number of skb queueable in the
43 *					dgram receiver.
44 *		Artur Skawina   :	Hash function optimizations
45 *	     Alexey Kuznetsov   :	Full scale SMP. Lot of bugs are introduced 8)
46 *	      Malcolm Beattie   :	Set peercred for socketpair
47 *	     Michal Ostrowski   :       Module initialization cleanup.
48 *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
49 *	     				the core infrastructure is doing that
50 *	     				for all net proto families now (2.5.69+)
51 *
52 *
53 * Known differences from reference BSD that was tested:
54 *
55 *	[TO FIX]
56 *	ECONNREFUSED is not returned from one end of a connected() socket to the
57 *		other the moment one end closes.
58 *	fstat() doesn't return st_dev=0, and give the blksize as high water mark
59 *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
60 *	[NOT TO FIX]
61 *	accept() returns a path name even if the connecting socket has closed
62 *		in the meantime (BSD loses the path and gives up).
63 *	accept() returns 0 length path for an unbound connector. BSD returns 16
64 *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
65 *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
66 *	BSD af_unix apparently has connect forgetting to block properly.
67 *		(need to check this with the POSIX spec in detail)
68 *
69 * Differences from 2.0.0-11-... (ANK)
70 *	Bug fixes and improvements.
71 *		- client shutdown killed server socket.
72 *		- removed all useless cli/sti pairs.
73 *
74 *	Semantic changes/extensions.
75 *		- generic control message passing.
76 *		- SCM_CREDENTIALS control message.
77 *		- "Abstract" (not FS based) socket bindings.
78 *		  Abstract names are sequences of bytes (not zero terminated)
79 *		  started by 0, so that this name space does not intersect
80 *		  with BSD names.
81 */
82
83#include <linux/module.h>
84#include <linux/kernel.h>
85#include <linux/signal.h>
86#include <linux/sched.h>
87#include <linux/errno.h>
88#include <linux/string.h>
89#include <linux/stat.h>
90#include <linux/dcache.h>
91#include <linux/namei.h>
92#include <linux/socket.h>
93#include <linux/un.h>
94#include <linux/fcntl.h>
95#include <linux/termios.h>
96#include <linux/sockios.h>
97#include <linux/net.h>
98#include <linux/in.h>
99#include <linux/fs.h>
100#include <linux/slab.h>
101#include <asm/uaccess.h>
102#include <linux/skbuff.h>
103#include <linux/netdevice.h>
104#include <net/net_namespace.h>
105#include <net/sock.h>
106#include <net/tcp_states.h>
107#include <net/af_unix.h>
108#include <linux/proc_fs.h>
109#include <linux/seq_file.h>
110#include <net/scm.h>
111#include <linux/init.h>
112#include <linux/poll.h>
113#include <linux/rtnetlink.h>
114#include <linux/mount.h>
115#include <net/checksum.h>
116#include <linux/security.h>
117
118struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1];
119EXPORT_SYMBOL_GPL(unix_socket_table);
120DEFINE_SPINLOCK(unix_table_lock);
121EXPORT_SYMBOL_GPL(unix_table_lock);
122static atomic_long_t unix_nr_socks;
123
124#define unix_sockets_unbound	(&unix_socket_table[UNIX_HASH_SIZE])
125
126#define UNIX_ABSTRACT(sk)	(unix_sk(sk)->addr->hash != UNIX_HASH_SIZE)
127
128#ifdef CONFIG_SECURITY_NETWORK
129static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
130{
131	memcpy(UNIXSID(skb), &scm->secid, sizeof(u32));
132}
133
134static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
135{
136	scm->secid = *UNIXSID(skb);
137}
138#else
139static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
140{ }
141
142static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
143{ }
144#endif /* CONFIG_SECURITY_NETWORK */
145
146/*
147 *  SMP locking strategy:
148 *    hash table is protected with spinlock unix_table_lock
149 *    each socket state is protected by separate spin lock.
150 */
151
152static inline unsigned unix_hash_fold(__wsum n)
153{
154	unsigned hash = (__force unsigned)n;
155	hash ^= hash>>16;
156	hash ^= hash>>8;
157	return hash&(UNIX_HASH_SIZE-1);
158}
159
160#define unix_peer(sk) (unix_sk(sk)->peer)
161
162static inline int unix_our_peer(struct sock *sk, struct sock *osk)
163{
164	return unix_peer(osk) == sk;
165}
166
167static inline int unix_may_send(struct sock *sk, struct sock *osk)
168{
169	return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
170}
171
172static inline int unix_recvq_full(struct sock const *sk)
173{
174	return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
175}
176
177struct sock *unix_peer_get(struct sock *s)
178{
179	struct sock *peer;
180
181	unix_state_lock(s);
182	peer = unix_peer(s);
183	if (peer)
184		sock_hold(peer);
185	unix_state_unlock(s);
186	return peer;
187}
188EXPORT_SYMBOL_GPL(unix_peer_get);
189
190static inline void unix_release_addr(struct unix_address *addr)
191{
192	if (atomic_dec_and_test(&addr->refcnt))
193		kfree(addr);
194}
195
196/*
197 *	Check unix socket name:
198 *		- should be not zero length.
199 *	        - if started by not zero, should be NULL terminated (FS object)
200 *		- if started by zero, it is abstract name.
201 */
202
203static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned *hashp)
204{
205	if (len <= sizeof(short) || len > sizeof(*sunaddr))
206		return -EINVAL;
207	if (!sunaddr || sunaddr->sun_family != AF_UNIX)
208		return -EINVAL;
209	if (sunaddr->sun_path[0]) {
210		/*
211		 * This may look like an off by one error but it is a bit more
212		 * subtle. 108 is the longest valid AF_UNIX path for a binding.
213		 * sun_path[108] doesn't as such exist.  However in kernel space
214		 * we are guaranteed that it is a valid memory location in our
215		 * kernel address buffer.
216		 */
217		((char *)sunaddr)[len] = 0;
218		len = strlen(sunaddr->sun_path)+1+sizeof(short);
219		return len;
220	}
221
222	*hashp = unix_hash_fold(csum_partial(sunaddr, len, 0));
223	return len;
224}
225
226static void __unix_remove_socket(struct sock *sk)
227{
228	sk_del_node_init(sk);
229}
230
231static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
232{
233	WARN_ON(!sk_unhashed(sk));
234	sk_add_node(sk, list);
235}
236
237static inline void unix_remove_socket(struct sock *sk)
238{
239	spin_lock(&unix_table_lock);
240	__unix_remove_socket(sk);
241	spin_unlock(&unix_table_lock);
242}
243
244static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
245{
246	spin_lock(&unix_table_lock);
247	__unix_insert_socket(list, sk);
248	spin_unlock(&unix_table_lock);
249}
250
251static struct sock *__unix_find_socket_byname(struct net *net,
252					      struct sockaddr_un *sunname,
253					      int len, int type, unsigned hash)
254{
255	struct sock *s;
256	struct hlist_node *node;
257
258	sk_for_each(s, node, &unix_socket_table[hash ^ type]) {
259		struct unix_sock *u = unix_sk(s);
260
261		if (!net_eq(sock_net(s), net))
262			continue;
263
264		if (u->addr->len == len &&
265		    !memcmp(u->addr->name, sunname, len))
266			goto found;
267	}
268	s = NULL;
269found:
270	return s;
271}
272
273static inline struct sock *unix_find_socket_byname(struct net *net,
274						   struct sockaddr_un *sunname,
275						   int len, int type,
276						   unsigned hash)
277{
278	struct sock *s;
279
280	spin_lock(&unix_table_lock);
281	s = __unix_find_socket_byname(net, sunname, len, type, hash);
282	if (s)
283		sock_hold(s);
284	spin_unlock(&unix_table_lock);
285	return s;
286}
287
288static struct sock *unix_find_socket_byinode(struct inode *i)
289{
290	struct sock *s;
291	struct hlist_node *node;
292
293	spin_lock(&unix_table_lock);
294	sk_for_each(s, node,
295		    &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
296		struct dentry *dentry = unix_sk(s)->dentry;
297
298		if (dentry && dentry->d_inode == i) {
299			sock_hold(s);
300			goto found;
301		}
302	}
303	s = NULL;
304found:
305	spin_unlock(&unix_table_lock);
306	return s;
307}
308
309static inline int unix_writable(struct sock *sk)
310{
311	return (atomic_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
312}
313
314static void unix_write_space(struct sock *sk)
315{
316	struct socket_wq *wq;
317
318	rcu_read_lock();
319	if (unix_writable(sk)) {
320		wq = rcu_dereference(sk->sk_wq);
321		if (wq_has_sleeper(wq))
322			wake_up_interruptible_sync_poll(&wq->wait,
323				POLLOUT | POLLWRNORM | POLLWRBAND);
324		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
325	}
326	rcu_read_unlock();
327}
328
329/* When dgram socket disconnects (or changes its peer), we clear its receive
330 * queue of packets arrived from previous peer. First, it allows to do
331 * flow control based only on wmem_alloc; second, sk connected to peer
332 * may receive messages only from that peer. */
333static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
334{
335	if (!skb_queue_empty(&sk->sk_receive_queue)) {
336		skb_queue_purge(&sk->sk_receive_queue);
337		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
338
339		/* If one link of bidirectional dgram pipe is disconnected,
340		 * we signal error. Messages are lost. Do not make this,
341		 * when peer was not connected to us.
342		 */
343		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
344			other->sk_err = ECONNRESET;
345			other->sk_error_report(other);
346		}
347	}
348}
349
350static void unix_sock_destructor(struct sock *sk)
351{
352	struct unix_sock *u = unix_sk(sk);
353
354	skb_queue_purge(&sk->sk_receive_queue);
355
356	WARN_ON(atomic_read(&sk->sk_wmem_alloc));
357	WARN_ON(!sk_unhashed(sk));
358	WARN_ON(sk->sk_socket);
359	if (!sock_flag(sk, SOCK_DEAD)) {
360		printk(KERN_INFO "Attempt to release alive unix socket: %p\n", sk);
361		return;
362	}
363
364	if (u->addr)
365		unix_release_addr(u->addr);
366
367	atomic_long_dec(&unix_nr_socks);
368	local_bh_disable();
369	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
370	local_bh_enable();
371#ifdef UNIX_REFCNT_DEBUG
372	printk(KERN_DEBUG "UNIX %p is destroyed, %ld are still alive.\n", sk,
373		atomic_long_read(&unix_nr_socks));
374#endif
375}
376
377static int unix_release_sock(struct sock *sk, int embrion)
378{
379	struct unix_sock *u = unix_sk(sk);
380	struct dentry *dentry;
381	struct vfsmount *mnt;
382	struct sock *skpair;
383	struct sk_buff *skb;
384	int state;
385
386	unix_remove_socket(sk);
387
388	/* Clear state */
389	unix_state_lock(sk);
390	sock_orphan(sk);
391	sk->sk_shutdown = SHUTDOWN_MASK;
392	dentry	     = u->dentry;
393	u->dentry    = NULL;
394	mnt	     = u->mnt;
395	u->mnt	     = NULL;
396	state = sk->sk_state;
397	sk->sk_state = TCP_CLOSE;
398	unix_state_unlock(sk);
399
400	wake_up_interruptible_all(&u->peer_wait);
401
402	skpair = unix_peer(sk);
403
404	if (skpair != NULL) {
405		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
406			unix_state_lock(skpair);
407			/* No more writes */
408			skpair->sk_shutdown = SHUTDOWN_MASK;
409			if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
410				skpair->sk_err = ECONNRESET;
411			unix_state_unlock(skpair);
412			skpair->sk_state_change(skpair);
413			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
414		}
415		sock_put(skpair); /* It may now die */
416		unix_peer(sk) = NULL;
417	}
418
419	/* Try to flush out this socket. Throw out buffers at least */
420
421	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
422		if (state == TCP_LISTEN)
423			unix_release_sock(skb->sk, 1);
424		/* passed fds are erased in the kfree_skb hook	      */
425		kfree_skb(skb);
426	}
427
428	if (dentry) {
429		dput(dentry);
430		mntput(mnt);
431	}
432
433	sock_put(sk);
434
435	/* ---- Socket is dead now and most probably destroyed ---- */
436
437	/*
438	 * Fixme: BSD difference: In BSD all sockets connected to use get
439	 *	  ECONNRESET and we die on the spot. In Linux we behave
440	 *	  like files and pipes do and wait for the last
441	 *	  dereference.
442	 *
443	 * Can't we simply set sock->err?
444	 *
445	 *	  What the above comment does talk about? --ANK(980817)
446	 */
447
448	if (unix_tot_inflight)
449		unix_gc();		/* Garbage collect fds */
450
451	return 0;
452}
453
454static void init_peercred(struct sock *sk)
455{
456	put_pid(sk->sk_peer_pid);
457	if (sk->sk_peer_cred)
458		put_cred(sk->sk_peer_cred);
459	sk->sk_peer_pid  = get_pid(task_tgid(current));
460	sk->sk_peer_cred = get_current_cred();
461}
462
463static void copy_peercred(struct sock *sk, struct sock *peersk)
464{
465	put_pid(sk->sk_peer_pid);
466	if (sk->sk_peer_cred)
467		put_cred(sk->sk_peer_cred);
468	sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
469	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
470}
471
472static int unix_listen(struct socket *sock, int backlog)
473{
474	int err;
475	struct sock *sk = sock->sk;
476	struct unix_sock *u = unix_sk(sk);
477	struct pid *old_pid = NULL;
478	const struct cred *old_cred = NULL;
479
480	err = -EOPNOTSUPP;
481	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
482		goto out;	/* Only stream/seqpacket sockets accept */
483	err = -EINVAL;
484	if (!u->addr)
485		goto out;	/* No listens on an unbound socket */
486	unix_state_lock(sk);
487	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
488		goto out_unlock;
489	if (backlog > sk->sk_max_ack_backlog)
490		wake_up_interruptible_all(&u->peer_wait);
491	sk->sk_max_ack_backlog	= backlog;
492	sk->sk_state		= TCP_LISTEN;
493	/* set credentials so connect can copy them */
494	init_peercred(sk);
495	err = 0;
496
497out_unlock:
498	unix_state_unlock(sk);
499	put_pid(old_pid);
500	if (old_cred)
501		put_cred(old_cred);
502out:
503	return err;
504}
505
506static int unix_release(struct socket *);
507static int unix_bind(struct socket *, struct sockaddr *, int);
508static int unix_stream_connect(struct socket *, struct sockaddr *,
509			       int addr_len, int flags);
510static int unix_socketpair(struct socket *, struct socket *);
511static int unix_accept(struct socket *, struct socket *, int);
512static int unix_getname(struct socket *, struct sockaddr *, int *, int);
513static unsigned int unix_poll(struct file *, struct socket *, poll_table *);
514static unsigned int unix_dgram_poll(struct file *, struct socket *,
515				    poll_table *);
516static int unix_ioctl(struct socket *, unsigned int, unsigned long);
517static int unix_shutdown(struct socket *, int);
518static int unix_stream_sendmsg(struct kiocb *, struct socket *,
519			       struct msghdr *, size_t);
520static int unix_stream_recvmsg(struct kiocb *, struct socket *,
521			       struct msghdr *, size_t, int);
522static int unix_dgram_sendmsg(struct kiocb *, struct socket *,
523			      struct msghdr *, size_t);
524static int unix_dgram_recvmsg(struct kiocb *, struct socket *,
525			      struct msghdr *, size_t, int);
526static int unix_dgram_connect(struct socket *, struct sockaddr *,
527			      int, int);
528static int unix_seqpacket_sendmsg(struct kiocb *, struct socket *,
529				  struct msghdr *, size_t);
530static int unix_seqpacket_recvmsg(struct kiocb *, struct socket *,
531				  struct msghdr *, size_t, int);
532
533static void unix_set_peek_off(struct sock *sk, int val)
534{
535	struct unix_sock *u = unix_sk(sk);
536
537	mutex_lock(&u->readlock);
538	sk->sk_peek_off = val;
539	mutex_unlock(&u->readlock);
540}
541
542
543static const struct proto_ops unix_stream_ops = {
544	.family =	PF_UNIX,
545	.owner =	THIS_MODULE,
546	.release =	unix_release,
547	.bind =		unix_bind,
548	.connect =	unix_stream_connect,
549	.socketpair =	unix_socketpair,
550	.accept =	unix_accept,
551	.getname =	unix_getname,
552	.poll =		unix_poll,
553	.ioctl =	unix_ioctl,
554	.listen =	unix_listen,
555	.shutdown =	unix_shutdown,
556	.setsockopt =	sock_no_setsockopt,
557	.getsockopt =	sock_no_getsockopt,
558	.sendmsg =	unix_stream_sendmsg,
559	.recvmsg =	unix_stream_recvmsg,
560	.mmap =		sock_no_mmap,
561	.sendpage =	sock_no_sendpage,
562	.set_peek_off =	unix_set_peek_off,
563};
564
565static const struct proto_ops unix_dgram_ops = {
566	.family =	PF_UNIX,
567	.owner =	THIS_MODULE,
568	.release =	unix_release,
569	.bind =		unix_bind,
570	.connect =	unix_dgram_connect,
571	.socketpair =	unix_socketpair,
572	.accept =	sock_no_accept,
573	.getname =	unix_getname,
574	.poll =		unix_dgram_poll,
575	.ioctl =	unix_ioctl,
576	.listen =	sock_no_listen,
577	.shutdown =	unix_shutdown,
578	.setsockopt =	sock_no_setsockopt,
579	.getsockopt =	sock_no_getsockopt,
580	.sendmsg =	unix_dgram_sendmsg,
581	.recvmsg =	unix_dgram_recvmsg,
582	.mmap =		sock_no_mmap,
583	.sendpage =	sock_no_sendpage,
584	.set_peek_off =	unix_set_peek_off,
585};
586
587static const struct proto_ops unix_seqpacket_ops = {
588	.family =	PF_UNIX,
589	.owner =	THIS_MODULE,
590	.release =	unix_release,
591	.bind =		unix_bind,
592	.connect =	unix_stream_connect,
593	.socketpair =	unix_socketpair,
594	.accept =	unix_accept,
595	.getname =	unix_getname,
596	.poll =		unix_dgram_poll,
597	.ioctl =	unix_ioctl,
598	.listen =	unix_listen,
599	.shutdown =	unix_shutdown,
600	.setsockopt =	sock_no_setsockopt,
601	.getsockopt =	sock_no_getsockopt,
602	.sendmsg =	unix_seqpacket_sendmsg,
603	.recvmsg =	unix_seqpacket_recvmsg,
604	.mmap =		sock_no_mmap,
605	.sendpage =	sock_no_sendpage,
606	.set_peek_off =	unix_set_peek_off,
607};
608
609static struct proto unix_proto = {
610	.name			= "UNIX",
611	.owner			= THIS_MODULE,
612	.obj_size		= sizeof(struct unix_sock),
613};
614
615/*
616 * AF_UNIX sockets do not interact with hardware, hence they
617 * dont trigger interrupts - so it's safe for them to have
618 * bh-unsafe locking for their sk_receive_queue.lock. Split off
619 * this special lock-class by reinitializing the spinlock key:
620 */
621static struct lock_class_key af_unix_sk_receive_queue_lock_key;
622
623static struct sock *unix_create1(struct net *net, struct socket *sock)
624{
625	struct sock *sk = NULL;
626	struct unix_sock *u;
627
628	atomic_long_inc(&unix_nr_socks);
629	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files())
630		goto out;
631
632	sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto);
633	if (!sk)
634		goto out;
635
636	sock_init_data(sock, sk);
637	lockdep_set_class(&sk->sk_receive_queue.lock,
638				&af_unix_sk_receive_queue_lock_key);
639
640	sk->sk_write_space	= unix_write_space;
641	sk->sk_max_ack_backlog	= net->unx.sysctl_max_dgram_qlen;
642	sk->sk_destruct		= unix_sock_destructor;
643	u	  = unix_sk(sk);
644	u->dentry = NULL;
645	u->mnt	  = NULL;
646	spin_lock_init(&u->lock);
647	atomic_long_set(&u->inflight, 0);
648	INIT_LIST_HEAD(&u->link);
649	mutex_init(&u->readlock); /* single task reading lock */
650	init_waitqueue_head(&u->peer_wait);
651	unix_insert_socket(unix_sockets_unbound, sk);
652out:
653	if (sk == NULL)
654		atomic_long_dec(&unix_nr_socks);
655	else {
656		local_bh_disable();
657		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
658		local_bh_enable();
659	}
660	return sk;
661}
662
663static int unix_create(struct net *net, struct socket *sock, int protocol,
664		       int kern)
665{
666	if (protocol && protocol != PF_UNIX)
667		return -EPROTONOSUPPORT;
668
669	sock->state = SS_UNCONNECTED;
670
671	switch (sock->type) {
672	case SOCK_STREAM:
673		sock->ops = &unix_stream_ops;
674		break;
675		/*
676		 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
677		 *	nothing uses it.
678		 */
679	case SOCK_RAW:
680		sock->type = SOCK_DGRAM;
681	case SOCK_DGRAM:
682		sock->ops = &unix_dgram_ops;
683		break;
684	case SOCK_SEQPACKET:
685		sock->ops = &unix_seqpacket_ops;
686		break;
687	default:
688		return -ESOCKTNOSUPPORT;
689	}
690
691	return unix_create1(net, sock) ? 0 : -ENOMEM;
692}
693
694static int unix_release(struct socket *sock)
695{
696	struct sock *sk = sock->sk;
697
698	if (!sk)
699		return 0;
700
701	sock->sk = NULL;
702
703	return unix_release_sock(sk, 0);
704}
705
706static int unix_autobind(struct socket *sock)
707{
708	struct sock *sk = sock->sk;
709	struct net *net = sock_net(sk);
710	struct unix_sock *u = unix_sk(sk);
711	static u32 ordernum = 1;
712	struct unix_address *addr;
713	int err;
714	unsigned int retries = 0;
715
716	mutex_lock(&u->readlock);
717
718	err = 0;
719	if (u->addr)
720		goto out;
721
722	err = -ENOMEM;
723	addr = kzalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL);
724	if (!addr)
725		goto out;
726
727	addr->name->sun_family = AF_UNIX;
728	atomic_set(&addr->refcnt, 1);
729
730retry:
731	addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short);
732	addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0));
733
734	spin_lock(&unix_table_lock);
735	ordernum = (ordernum+1)&0xFFFFF;
736
737	if (__unix_find_socket_byname(net, addr->name, addr->len, sock->type,
738				      addr->hash)) {
739		spin_unlock(&unix_table_lock);
740		/*
741		 * __unix_find_socket_byname() may take long time if many names
742		 * are already in use.
743		 */
744		cond_resched();
745		/* Give up if all names seems to be in use. */
746		if (retries++ == 0xFFFFF) {
747			err = -ENOSPC;
748			kfree(addr);
749			goto out;
750		}
751		goto retry;
752	}
753	addr->hash ^= sk->sk_type;
754
755	__unix_remove_socket(sk);
756	u->addr = addr;
757	__unix_insert_socket(&unix_socket_table[addr->hash], sk);
758	spin_unlock(&unix_table_lock);
759	err = 0;
760
761out:	mutex_unlock(&u->readlock);
762	return err;
763}
764
765static struct sock *unix_find_other(struct net *net,
766				    struct sockaddr_un *sunname, int len,
767				    int type, unsigned hash, int *error)
768{
769	struct sock *u;
770	struct path path;
771	int err = 0;
772
773	if (sunname->sun_path[0]) {
774		struct inode *inode;
775		err = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path);
776		if (err)
777			goto fail;
778		inode = path.dentry->d_inode;
779		err = inode_permission(inode, MAY_WRITE);
780		if (err)
781			goto put_fail;
782
783		err = -ECONNREFUSED;
784		if (!S_ISSOCK(inode->i_mode))
785			goto put_fail;
786		u = unix_find_socket_byinode(inode);
787		if (!u)
788			goto put_fail;
789
790		if (u->sk_type == type)
791			touch_atime(path.mnt, path.dentry);
792
793		path_put(&path);
794
795		err = -EPROTOTYPE;
796		if (u->sk_type != type) {
797			sock_put(u);
798			goto fail;
799		}
800	} else {
801		err = -ECONNREFUSED;
802		u = unix_find_socket_byname(net, sunname, len, type, hash);
803		if (u) {
804			struct dentry *dentry;
805			dentry = unix_sk(u)->dentry;
806			if (dentry)
807				touch_atime(unix_sk(u)->mnt, dentry);
808		} else
809			goto fail;
810	}
811	return u;
812
813put_fail:
814	path_put(&path);
815fail:
816	*error = err;
817	return NULL;
818}
819
820
821static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
822{
823	struct sock *sk = sock->sk;
824	struct net *net = sock_net(sk);
825	struct unix_sock *u = unix_sk(sk);
826	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
827	char *sun_path = sunaddr->sun_path;
828	struct dentry *dentry = NULL;
829	struct path path;
830	int err;
831	unsigned hash;
832	struct unix_address *addr;
833	struct hlist_head *list;
834
835	err = -EINVAL;
836	if (sunaddr->sun_family != AF_UNIX)
837		goto out;
838
839	if (addr_len == sizeof(short)) {
840		err = unix_autobind(sock);
841		goto out;
842	}
843
844	err = unix_mkname(sunaddr, addr_len, &hash);
845	if (err < 0)
846		goto out;
847	addr_len = err;
848
849	mutex_lock(&u->readlock);
850
851	err = -EINVAL;
852	if (u->addr)
853		goto out_up;
854
855	err = -ENOMEM;
856	addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
857	if (!addr)
858		goto out_up;
859
860	memcpy(addr->name, sunaddr, addr_len);
861	addr->len = addr_len;
862	addr->hash = hash ^ sk->sk_type;
863	atomic_set(&addr->refcnt, 1);
864
865	if (sun_path[0]) {
866		umode_t mode;
867		err = 0;
868		/*
869		 * Get the parent directory, calculate the hash for last
870		 * component.
871		 */
872		dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0);
873		err = PTR_ERR(dentry);
874		if (IS_ERR(dentry))
875			goto out_mknod_parent;
876
877		/*
878		 * All right, let's create it.
879		 */
880		mode = S_IFSOCK |
881		       (SOCK_INODE(sock)->i_mode & ~current_umask());
882		err = mnt_want_write(path.mnt);
883		if (err)
884			goto out_mknod_dput;
885		err = security_path_mknod(&path, dentry, mode, 0);
886		if (err)
887			goto out_mknod_drop_write;
888		err = vfs_mknod(path.dentry->d_inode, dentry, mode, 0);
889out_mknod_drop_write:
890		mnt_drop_write(path.mnt);
891		if (err)
892			goto out_mknod_dput;
893		mutex_unlock(&path.dentry->d_inode->i_mutex);
894		dput(path.dentry);
895		path.dentry = dentry;
896
897		addr->hash = UNIX_HASH_SIZE;
898	}
899
900	spin_lock(&unix_table_lock);
901
902	if (!sun_path[0]) {
903		err = -EADDRINUSE;
904		if (__unix_find_socket_byname(net, sunaddr, addr_len,
905					      sk->sk_type, hash)) {
906			unix_release_addr(addr);
907			goto out_unlock;
908		}
909
910		list = &unix_socket_table[addr->hash];
911	} else {
912		list = &unix_socket_table[dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1)];
913		u->dentry = path.dentry;
914		u->mnt    = path.mnt;
915	}
916
917	err = 0;
918	__unix_remove_socket(sk);
919	u->addr = addr;
920	__unix_insert_socket(list, sk);
921
922out_unlock:
923	spin_unlock(&unix_table_lock);
924out_up:
925	mutex_unlock(&u->readlock);
926out:
927	return err;
928
929out_mknod_dput:
930	dput(dentry);
931	mutex_unlock(&path.dentry->d_inode->i_mutex);
932	path_put(&path);
933out_mknod_parent:
934	if (err == -EEXIST)
935		err = -EADDRINUSE;
936	unix_release_addr(addr);
937	goto out_up;
938}
939
940static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
941{
942	if (unlikely(sk1 == sk2) || !sk2) {
943		unix_state_lock(sk1);
944		return;
945	}
946	if (sk1 < sk2) {
947		unix_state_lock(sk1);
948		unix_state_lock_nested(sk2);
949	} else {
950		unix_state_lock(sk2);
951		unix_state_lock_nested(sk1);
952	}
953}
954
955static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
956{
957	if (unlikely(sk1 == sk2) || !sk2) {
958		unix_state_unlock(sk1);
959		return;
960	}
961	unix_state_unlock(sk1);
962	unix_state_unlock(sk2);
963}
964
965static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
966			      int alen, int flags)
967{
968	struct sock *sk = sock->sk;
969	struct net *net = sock_net(sk);
970	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
971	struct sock *other;
972	unsigned hash;
973	int err;
974
975	if (addr->sa_family != AF_UNSPEC) {
976		err = unix_mkname(sunaddr, alen, &hash);
977		if (err < 0)
978			goto out;
979		alen = err;
980
981		if (test_bit(SOCK_PASSCRED, &sock->flags) &&
982		    !unix_sk(sk)->addr && (err = unix_autobind(sock)) != 0)
983			goto out;
984
985restart:
986		other = unix_find_other(net, sunaddr, alen, sock->type, hash, &err);
987		if (!other)
988			goto out;
989
990		unix_state_double_lock(sk, other);
991
992		/* Apparently VFS overslept socket death. Retry. */
993		if (sock_flag(other, SOCK_DEAD)) {
994			unix_state_double_unlock(sk, other);
995			sock_put(other);
996			goto restart;
997		}
998
999		err = -EPERM;
1000		if (!unix_may_send(sk, other))
1001			goto out_unlock;
1002
1003		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1004		if (err)
1005			goto out_unlock;
1006
1007	} else {
1008		/*
1009		 *	1003.1g breaking connected state with AF_UNSPEC
1010		 */
1011		other = NULL;
1012		unix_state_double_lock(sk, other);
1013	}
1014
1015	/*
1016	 * If it was connected, reconnect.
1017	 */
1018	if (unix_peer(sk)) {
1019		struct sock *old_peer = unix_peer(sk);
1020		unix_peer(sk) = other;
1021		unix_state_double_unlock(sk, other);
1022
1023		if (other != old_peer)
1024			unix_dgram_disconnected(sk, old_peer);
1025		sock_put(old_peer);
1026	} else {
1027		unix_peer(sk) = other;
1028		unix_state_double_unlock(sk, other);
1029	}
1030	return 0;
1031
1032out_unlock:
1033	unix_state_double_unlock(sk, other);
1034	sock_put(other);
1035out:
1036	return err;
1037}
1038
1039static long unix_wait_for_peer(struct sock *other, long timeo)
1040{
1041	struct unix_sock *u = unix_sk(other);
1042	int sched;
1043	DEFINE_WAIT(wait);
1044
1045	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1046
1047	sched = !sock_flag(other, SOCK_DEAD) &&
1048		!(other->sk_shutdown & RCV_SHUTDOWN) &&
1049		unix_recvq_full(other);
1050
1051	unix_state_unlock(other);
1052
1053	if (sched)
1054		timeo = schedule_timeout(timeo);
1055
1056	finish_wait(&u->peer_wait, &wait);
1057	return timeo;
1058}
1059
1060static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1061			       int addr_len, int flags)
1062{
1063	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1064	struct sock *sk = sock->sk;
1065	struct net *net = sock_net(sk);
1066	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1067	struct sock *newsk = NULL;
1068	struct sock *other = NULL;
1069	struct sk_buff *skb = NULL;
1070	unsigned hash;
1071	int st;
1072	int err;
1073	long timeo;
1074
1075	err = unix_mkname(sunaddr, addr_len, &hash);
1076	if (err < 0)
1077		goto out;
1078	addr_len = err;
1079
1080	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr &&
1081	    (err = unix_autobind(sock)) != 0)
1082		goto out;
1083
1084	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
1085
1086	/* First of all allocate resources.
1087	   If we will make it after state is locked,
1088	   we will have to recheck all again in any case.
1089	 */
1090
1091	err = -ENOMEM;
1092
1093	/* create new sock for complete connection */
1094	newsk = unix_create1(sock_net(sk), NULL);
1095	if (newsk == NULL)
1096		goto out;
1097
1098	/* Allocate skb for sending to listening sock */
1099	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
1100	if (skb == NULL)
1101		goto out;
1102
1103restart:
1104	/*  Find listening sock. */
1105	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, hash, &err);
1106	if (!other)
1107		goto out;
1108
1109	/* Latch state of peer */
1110	unix_state_lock(other);
1111
1112	/* Apparently VFS overslept socket death. Retry. */
1113	if (sock_flag(other, SOCK_DEAD)) {
1114		unix_state_unlock(other);
1115		sock_put(other);
1116		goto restart;
1117	}
1118
1119	err = -ECONNREFUSED;
1120	if (other->sk_state != TCP_LISTEN)
1121		goto out_unlock;
1122	if (other->sk_shutdown & RCV_SHUTDOWN)
1123		goto out_unlock;
1124
1125	if (unix_recvq_full(other)) {
1126		err = -EAGAIN;
1127		if (!timeo)
1128			goto out_unlock;
1129
1130		timeo = unix_wait_for_peer(other, timeo);
1131
1132		err = sock_intr_errno(timeo);
1133		if (signal_pending(current))
1134			goto out;
1135		sock_put(other);
1136		goto restart;
1137	}
1138
1139	/* Latch our state.
1140
1141	   It is tricky place. We need to grab our state lock and cannot
1142	   drop lock on peer. It is dangerous because deadlock is
1143	   possible. Connect to self case and simultaneous
1144	   attempt to connect are eliminated by checking socket
1145	   state. other is TCP_LISTEN, if sk is TCP_LISTEN we
1146	   check this before attempt to grab lock.
1147
1148	   Well, and we have to recheck the state after socket locked.
1149	 */
1150	st = sk->sk_state;
1151
1152	switch (st) {
1153	case TCP_CLOSE:
1154		/* This is ok... continue with connect */
1155		break;
1156	case TCP_ESTABLISHED:
1157		/* Socket is already connected */
1158		err = -EISCONN;
1159		goto out_unlock;
1160	default:
1161		err = -EINVAL;
1162		goto out_unlock;
1163	}
1164
1165	unix_state_lock_nested(sk);
1166
1167	if (sk->sk_state != st) {
1168		unix_state_unlock(sk);
1169		unix_state_unlock(other);
1170		sock_put(other);
1171		goto restart;
1172	}
1173
1174	err = security_unix_stream_connect(sk, other, newsk);
1175	if (err) {
1176		unix_state_unlock(sk);
1177		goto out_unlock;
1178	}
1179
1180	/* The way is open! Fastly set all the necessary fields... */
1181
1182	sock_hold(sk);
1183	unix_peer(newsk)	= sk;
1184	newsk->sk_state		= TCP_ESTABLISHED;
1185	newsk->sk_type		= sk->sk_type;
1186	init_peercred(newsk);
1187	newu = unix_sk(newsk);
1188	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1189	otheru = unix_sk(other);
1190
1191	/* copy address information from listening to new sock*/
1192	if (otheru->addr) {
1193		atomic_inc(&otheru->addr->refcnt);
1194		newu->addr = otheru->addr;
1195	}
1196	if (otheru->dentry) {
1197		newu->dentry	= dget(otheru->dentry);
1198		newu->mnt	= mntget(otheru->mnt);
1199	}
1200
1201	/* Set credentials */
1202	copy_peercred(sk, other);
1203
1204	sock->state	= SS_CONNECTED;
1205	sk->sk_state	= TCP_ESTABLISHED;
1206	sock_hold(newsk);
1207
1208	smp_mb__after_atomic_inc();	/* sock_hold() does an atomic_inc() */
1209	unix_peer(sk)	= newsk;
1210
1211	unix_state_unlock(sk);
1212
1213	/* take ten and and send info to listening sock */
1214	spin_lock(&other->sk_receive_queue.lock);
1215	__skb_queue_tail(&other->sk_receive_queue, skb);
1216	spin_unlock(&other->sk_receive_queue.lock);
1217	unix_state_unlock(other);
1218	other->sk_data_ready(other, 0);
1219	sock_put(other);
1220	return 0;
1221
1222out_unlock:
1223	if (other)
1224		unix_state_unlock(other);
1225
1226out:
1227	kfree_skb(skb);
1228	if (newsk)
1229		unix_release_sock(newsk, 0);
1230	if (other)
1231		sock_put(other);
1232	return err;
1233}
1234
1235static int unix_socketpair(struct socket *socka, struct socket *sockb)
1236{
1237	struct sock *ska = socka->sk, *skb = sockb->sk;
1238
1239	/* Join our sockets back to back */
1240	sock_hold(ska);
1241	sock_hold(skb);
1242	unix_peer(ska) = skb;
1243	unix_peer(skb) = ska;
1244	init_peercred(ska);
1245	init_peercred(skb);
1246
1247	if (ska->sk_type != SOCK_DGRAM) {
1248		ska->sk_state = TCP_ESTABLISHED;
1249		skb->sk_state = TCP_ESTABLISHED;
1250		socka->state  = SS_CONNECTED;
1251		sockb->state  = SS_CONNECTED;
1252	}
1253	return 0;
1254}
1255
1256static int unix_accept(struct socket *sock, struct socket *newsock, int flags)
1257{
1258	struct sock *sk = sock->sk;
1259	struct sock *tsk;
1260	struct sk_buff *skb;
1261	int err;
1262
1263	err = -EOPNOTSUPP;
1264	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1265		goto out;
1266
1267	err = -EINVAL;
1268	if (sk->sk_state != TCP_LISTEN)
1269		goto out;
1270
1271	/* If socket state is TCP_LISTEN it cannot change (for now...),
1272	 * so that no locks are necessary.
1273	 */
1274
1275	skb = skb_recv_datagram(sk, 0, flags&O_NONBLOCK, &err);
1276	if (!skb) {
1277		/* This means receive shutdown. */
1278		if (err == 0)
1279			err = -EINVAL;
1280		goto out;
1281	}
1282
1283	tsk = skb->sk;
1284	skb_free_datagram(sk, skb);
1285	wake_up_interruptible(&unix_sk(sk)->peer_wait);
1286
1287	/* attach accepted sock to socket */
1288	unix_state_lock(tsk);
1289	newsock->state = SS_CONNECTED;
1290	sock_graft(tsk, newsock);
1291	unix_state_unlock(tsk);
1292	return 0;
1293
1294out:
1295	return err;
1296}
1297
1298
1299static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_len, int peer)
1300{
1301	struct sock *sk = sock->sk;
1302	struct unix_sock *u;
1303	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1304	int err = 0;
1305
1306	if (peer) {
1307		sk = unix_peer_get(sk);
1308
1309		err = -ENOTCONN;
1310		if (!sk)
1311			goto out;
1312		err = 0;
1313	} else {
1314		sock_hold(sk);
1315	}
1316
1317	u = unix_sk(sk);
1318	unix_state_lock(sk);
1319	if (!u->addr) {
1320		sunaddr->sun_family = AF_UNIX;
1321		sunaddr->sun_path[0] = 0;
1322		*uaddr_len = sizeof(short);
1323	} else {
1324		struct unix_address *addr = u->addr;
1325
1326		*uaddr_len = addr->len;
1327		memcpy(sunaddr, addr->name, *uaddr_len);
1328	}
1329	unix_state_unlock(sk);
1330	sock_put(sk);
1331out:
1332	return err;
1333}
1334
1335static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1336{
1337	int i;
1338
1339	scm->fp = UNIXCB(skb).fp;
1340	UNIXCB(skb).fp = NULL;
1341
1342	for (i = scm->fp->count-1; i >= 0; i--)
1343		unix_notinflight(scm->fp->fp[i]);
1344}
1345
1346static void unix_destruct_scm(struct sk_buff *skb)
1347{
1348	struct scm_cookie scm;
1349	memset(&scm, 0, sizeof(scm));
1350	scm.pid  = UNIXCB(skb).pid;
1351	scm.cred = UNIXCB(skb).cred;
1352	if (UNIXCB(skb).fp)
1353		unix_detach_fds(&scm, skb);
1354
1355	/* Alas, it calls VFS */
1356	/* So fscking what? fput() had been SMP-safe since the last Summer */
1357	scm_destroy(&scm);
1358	sock_wfree(skb);
1359}
1360
1361#define MAX_RECURSION_LEVEL 4
1362
1363static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1364{
1365	int i;
1366	unsigned char max_level = 0;
1367	int unix_sock_count = 0;
1368
1369	for (i = scm->fp->count - 1; i >= 0; i--) {
1370		struct sock *sk = unix_get_socket(scm->fp->fp[i]);
1371
1372		if (sk) {
1373			unix_sock_count++;
1374			max_level = max(max_level,
1375					unix_sk(sk)->recursion_level);
1376		}
1377	}
1378	if (unlikely(max_level > MAX_RECURSION_LEVEL))
1379		return -ETOOMANYREFS;
1380
1381	/*
1382	 * Need to duplicate file references for the sake of garbage
1383	 * collection.  Otherwise a socket in the fps might become a
1384	 * candidate for GC while the skb is not yet queued.
1385	 */
1386	UNIXCB(skb).fp = scm_fp_dup(scm->fp);
1387	if (!UNIXCB(skb).fp)
1388		return -ENOMEM;
1389
1390	if (unix_sock_count) {
1391		for (i = scm->fp->count - 1; i >= 0; i--)
1392			unix_inflight(scm->fp->fp[i]);
1393	}
1394	return max_level;
1395}
1396
1397static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1398{
1399	int err = 0;
1400
1401	UNIXCB(skb).pid  = get_pid(scm->pid);
1402	if (scm->cred)
1403		UNIXCB(skb).cred = get_cred(scm->cred);
1404	UNIXCB(skb).fp = NULL;
1405	if (scm->fp && send_fds)
1406		err = unix_attach_fds(scm, skb);
1407
1408	skb->destructor = unix_destruct_scm;
1409	return err;
1410}
1411
1412/*
1413 * Some apps rely on write() giving SCM_CREDENTIALS
1414 * We include credentials if source or destination socket
1415 * asserted SOCK_PASSCRED.
1416 */
1417static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1418			    const struct sock *other)
1419{
1420	if (UNIXCB(skb).cred)
1421		return;
1422	if (test_bit(SOCK_PASSCRED, &sock->flags) ||
1423	    !other->sk_socket ||
1424	    test_bit(SOCK_PASSCRED, &other->sk_socket->flags)) {
1425		UNIXCB(skb).pid  = get_pid(task_tgid(current));
1426		UNIXCB(skb).cred = get_current_cred();
1427	}
1428}
1429
1430/*
1431 *	Send AF_UNIX data.
1432 */
1433
1434static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock,
1435			      struct msghdr *msg, size_t len)
1436{
1437	struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
1438	struct sock *sk = sock->sk;
1439	struct net *net = sock_net(sk);
1440	struct unix_sock *u = unix_sk(sk);
1441	struct sockaddr_un *sunaddr = msg->msg_name;
1442	struct sock *other = NULL;
1443	int namelen = 0; /* fake GCC */
1444	int err;
1445	unsigned hash;
1446	struct sk_buff *skb;
1447	long timeo;
1448	struct scm_cookie tmp_scm;
1449	int max_level;
1450
1451	if (NULL == siocb->scm)
1452		siocb->scm = &tmp_scm;
1453	wait_for_unix_gc();
1454	err = scm_send(sock, msg, siocb->scm);
1455	if (err < 0)
1456		return err;
1457
1458	err = -EOPNOTSUPP;
1459	if (msg->msg_flags&MSG_OOB)
1460		goto out;
1461
1462	if (msg->msg_namelen) {
1463		err = unix_mkname(sunaddr, msg->msg_namelen, &hash);
1464		if (err < 0)
1465			goto out;
1466		namelen = err;
1467	} else {
1468		sunaddr = NULL;
1469		err = -ENOTCONN;
1470		other = unix_peer_get(sk);
1471		if (!other)
1472			goto out;
1473	}
1474
1475	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr
1476	    && (err = unix_autobind(sock)) != 0)
1477		goto out;
1478
1479	err = -EMSGSIZE;
1480	if (len > sk->sk_sndbuf - 32)
1481		goto out;
1482
1483	skb = sock_alloc_send_skb(sk, len, msg->msg_flags&MSG_DONTWAIT, &err);
1484	if (skb == NULL)
1485		goto out;
1486
1487	err = unix_scm_to_skb(siocb->scm, skb, true);
1488	if (err < 0)
1489		goto out_free;
1490	max_level = err + 1;
1491	unix_get_secdata(siocb->scm, skb);
1492
1493	skb_reset_transport_header(skb);
1494	err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
1495	if (err)
1496		goto out_free;
1497
1498	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1499
1500restart:
1501	if (!other) {
1502		err = -ECONNRESET;
1503		if (sunaddr == NULL)
1504			goto out_free;
1505
1506		other = unix_find_other(net, sunaddr, namelen, sk->sk_type,
1507					hash, &err);
1508		if (other == NULL)
1509			goto out_free;
1510	}
1511
1512	if (sk_filter(other, skb) < 0) {
1513		/* Toss the packet but do not return any error to the sender */
1514		err = len;
1515		goto out_free;
1516	}
1517
1518	unix_state_lock(other);
1519	err = -EPERM;
1520	if (!unix_may_send(sk, other))
1521		goto out_unlock;
1522
1523	if (sock_flag(other, SOCK_DEAD)) {
1524		/*
1525		 *	Check with 1003.1g - what should
1526		 *	datagram error
1527		 */
1528		unix_state_unlock(other);
1529		sock_put(other);
1530
1531		err = 0;
1532		unix_state_lock(sk);
1533		if (unix_peer(sk) == other) {
1534			unix_peer(sk) = NULL;
1535			unix_state_unlock(sk);
1536
1537			unix_dgram_disconnected(sk, other);
1538			sock_put(other);
1539			err = -ECONNREFUSED;
1540		} else {
1541			unix_state_unlock(sk);
1542		}
1543
1544		other = NULL;
1545		if (err)
1546			goto out_free;
1547		goto restart;
1548	}
1549
1550	err = -EPIPE;
1551	if (other->sk_shutdown & RCV_SHUTDOWN)
1552		goto out_unlock;
1553
1554	if (sk->sk_type != SOCK_SEQPACKET) {
1555		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1556		if (err)
1557			goto out_unlock;
1558	}
1559
1560	if (unix_peer(other) != sk && unix_recvq_full(other)) {
1561		if (!timeo) {
1562			err = -EAGAIN;
1563			goto out_unlock;
1564		}
1565
1566		timeo = unix_wait_for_peer(other, timeo);
1567
1568		err = sock_intr_errno(timeo);
1569		if (signal_pending(current))
1570			goto out_free;
1571
1572		goto restart;
1573	}
1574
1575	if (sock_flag(other, SOCK_RCVTSTAMP))
1576		__net_timestamp(skb);
1577	maybe_add_creds(skb, sock, other);
1578	skb_queue_tail(&other->sk_receive_queue, skb);
1579	if (max_level > unix_sk(other)->recursion_level)
1580		unix_sk(other)->recursion_level = max_level;
1581	unix_state_unlock(other);
1582	other->sk_data_ready(other, len);
1583	sock_put(other);
1584	scm_destroy(siocb->scm);
1585	return len;
1586
1587out_unlock:
1588	unix_state_unlock(other);
1589out_free:
1590	kfree_skb(skb);
1591out:
1592	if (other)
1593		sock_put(other);
1594	scm_destroy(siocb->scm);
1595	return err;
1596}
1597
1598
1599static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
1600			       struct msghdr *msg, size_t len)
1601{
1602	struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
1603	struct sock *sk = sock->sk;
1604	struct sock *other = NULL;
1605	int err, size;
1606	struct sk_buff *skb;
1607	int sent = 0;
1608	struct scm_cookie tmp_scm;
1609	bool fds_sent = false;
1610	int max_level;
1611
1612	if (NULL == siocb->scm)
1613		siocb->scm = &tmp_scm;
1614	wait_for_unix_gc();
1615	err = scm_send(sock, msg, siocb->scm);
1616	if (err < 0)
1617		return err;
1618
1619	err = -EOPNOTSUPP;
1620	if (msg->msg_flags&MSG_OOB)
1621		goto out_err;
1622
1623	if (msg->msg_namelen) {
1624		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
1625		goto out_err;
1626	} else {
1627		err = -ENOTCONN;
1628		other = unix_peer(sk);
1629		if (!other)
1630			goto out_err;
1631	}
1632
1633	if (sk->sk_shutdown & SEND_SHUTDOWN)
1634		goto pipe_err;
1635
1636	while (sent < len) {
1637		/*
1638		 *	Optimisation for the fact that under 0.01% of X
1639		 *	messages typically need breaking up.
1640		 */
1641
1642		size = len-sent;
1643
1644		/* Keep two messages in the pipe so it schedules better */
1645		if (size > ((sk->sk_sndbuf >> 1) - 64))
1646			size = (sk->sk_sndbuf >> 1) - 64;
1647
1648		if (size > SKB_MAX_ALLOC)
1649			size = SKB_MAX_ALLOC;
1650
1651		/*
1652		 *	Grab a buffer
1653		 */
1654
1655		skb = sock_alloc_send_skb(sk, size, msg->msg_flags&MSG_DONTWAIT,
1656					  &err);
1657
1658		if (skb == NULL)
1659			goto out_err;
1660
1661		/*
1662		 *	If you pass two values to the sock_alloc_send_skb
1663		 *	it tries to grab the large buffer with GFP_NOFS
1664		 *	(which can fail easily), and if it fails grab the
1665		 *	fallback size buffer which is under a page and will
1666		 *	succeed. [Alan]
1667		 */
1668		size = min_t(int, size, skb_tailroom(skb));
1669
1670
1671		/* Only send the fds in the first buffer */
1672		err = unix_scm_to_skb(siocb->scm, skb, !fds_sent);
1673		if (err < 0) {
1674			kfree_skb(skb);
1675			goto out_err;
1676		}
1677		max_level = err + 1;
1678		fds_sent = true;
1679
1680		err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size);
1681		if (err) {
1682			kfree_skb(skb);
1683			goto out_err;
1684		}
1685
1686		unix_state_lock(other);
1687
1688		if (sock_flag(other, SOCK_DEAD) ||
1689		    (other->sk_shutdown & RCV_SHUTDOWN))
1690			goto pipe_err_free;
1691
1692		maybe_add_creds(skb, sock, other);
1693		skb_queue_tail(&other->sk_receive_queue, skb);
1694		if (max_level > unix_sk(other)->recursion_level)
1695			unix_sk(other)->recursion_level = max_level;
1696		unix_state_unlock(other);
1697		other->sk_data_ready(other, size);
1698		sent += size;
1699	}
1700
1701	scm_destroy(siocb->scm);
1702	siocb->scm = NULL;
1703
1704	return sent;
1705
1706pipe_err_free:
1707	unix_state_unlock(other);
1708	kfree_skb(skb);
1709pipe_err:
1710	if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
1711		send_sig(SIGPIPE, current, 0);
1712	err = -EPIPE;
1713out_err:
1714	scm_destroy(siocb->scm);
1715	siocb->scm = NULL;
1716	return sent ? : err;
1717}
1718
1719static int unix_seqpacket_sendmsg(struct kiocb *kiocb, struct socket *sock,
1720				  struct msghdr *msg, size_t len)
1721{
1722	int err;
1723	struct sock *sk = sock->sk;
1724
1725	err = sock_error(sk);
1726	if (err)
1727		return err;
1728
1729	if (sk->sk_state != TCP_ESTABLISHED)
1730		return -ENOTCONN;
1731
1732	if (msg->msg_namelen)
1733		msg->msg_namelen = 0;
1734
1735	return unix_dgram_sendmsg(kiocb, sock, msg, len);
1736}
1737
1738static int unix_seqpacket_recvmsg(struct kiocb *iocb, struct socket *sock,
1739			      struct msghdr *msg, size_t size,
1740			      int flags)
1741{
1742	struct sock *sk = sock->sk;
1743
1744	if (sk->sk_state != TCP_ESTABLISHED)
1745		return -ENOTCONN;
1746
1747	return unix_dgram_recvmsg(iocb, sock, msg, size, flags);
1748}
1749
1750static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
1751{
1752	struct unix_sock *u = unix_sk(sk);
1753
1754	msg->msg_namelen = 0;
1755	if (u->addr) {
1756		msg->msg_namelen = u->addr->len;
1757		memcpy(msg->msg_name, u->addr->name, u->addr->len);
1758	}
1759}
1760
1761static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
1762			      struct msghdr *msg, size_t size,
1763			      int flags)
1764{
1765	struct sock_iocb *siocb = kiocb_to_siocb(iocb);
1766	struct scm_cookie tmp_scm;
1767	struct sock *sk = sock->sk;
1768	struct unix_sock *u = unix_sk(sk);
1769	int noblock = flags & MSG_DONTWAIT;
1770	struct sk_buff *skb;
1771	int err;
1772	int peeked, skip;
1773
1774	err = -EOPNOTSUPP;
1775	if (flags&MSG_OOB)
1776		goto out;
1777
1778	msg->msg_namelen = 0;
1779
1780	err = mutex_lock_interruptible(&u->readlock);
1781	if (err) {
1782		err = sock_intr_errno(sock_rcvtimeo(sk, noblock));
1783		goto out;
1784	}
1785
1786	skip = sk_peek_offset(sk, flags);
1787
1788	skb = __skb_recv_datagram(sk, flags, &peeked, &skip, &err);
1789	if (!skb) {
1790		unix_state_lock(sk);
1791		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
1792		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
1793		    (sk->sk_shutdown & RCV_SHUTDOWN))
1794			err = 0;
1795		unix_state_unlock(sk);
1796		goto out_unlock;
1797	}
1798
1799	wake_up_interruptible_sync_poll(&u->peer_wait,
1800					POLLOUT | POLLWRNORM | POLLWRBAND);
1801
1802	if (msg->msg_name)
1803		unix_copy_addr(msg, skb->sk);
1804
1805	if (size > skb->len - skip)
1806		size = skb->len - skip;
1807	else if (size < skb->len - skip)
1808		msg->msg_flags |= MSG_TRUNC;
1809
1810	err = skb_copy_datagram_iovec(skb, skip, msg->msg_iov, size);
1811	if (err)
1812		goto out_free;
1813
1814	if (sock_flag(sk, SOCK_RCVTSTAMP))
1815		__sock_recv_timestamp(msg, sk, skb);
1816
1817	if (!siocb->scm) {
1818		siocb->scm = &tmp_scm;
1819		memset(&tmp_scm, 0, sizeof(tmp_scm));
1820	}
1821	scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).cred);
1822	unix_set_secdata(siocb->scm, skb);
1823
1824	if (!(flags & MSG_PEEK)) {
1825		if (UNIXCB(skb).fp)
1826			unix_detach_fds(siocb->scm, skb);
1827
1828		sk_peek_offset_bwd(sk, skb->len);
1829	} else {
1830		/* It is questionable: on PEEK we could:
1831		   - do not return fds - good, but too simple 8)
1832		   - return fds, and do not return them on read (old strategy,
1833		     apparently wrong)
1834		   - clone fds (I chose it for now, it is the most universal
1835		     solution)
1836
1837		   POSIX 1003.1g does not actually define this clearly
1838		   at all. POSIX 1003.1g doesn't define a lot of things
1839		   clearly however!
1840
1841		*/
1842
1843		sk_peek_offset_fwd(sk, size);
1844
1845		if (UNIXCB(skb).fp)
1846			siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1847	}
1848	err = size;
1849
1850	scm_recv(sock, msg, siocb->scm, flags);
1851
1852out_free:
1853	skb_free_datagram(sk, skb);
1854out_unlock:
1855	mutex_unlock(&u->readlock);
1856out:
1857	return err;
1858}
1859
1860/*
1861 *	Sleep until data has arrive. But check for races..
1862 */
1863
1864static long unix_stream_data_wait(struct sock *sk, long timeo)
1865{
1866	DEFINE_WAIT(wait);
1867
1868	unix_state_lock(sk);
1869
1870	for (;;) {
1871		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1872
1873		if (!skb_queue_empty(&sk->sk_receive_queue) ||
1874		    sk->sk_err ||
1875		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
1876		    signal_pending(current) ||
1877		    !timeo)
1878			break;
1879
1880		set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1881		unix_state_unlock(sk);
1882		timeo = schedule_timeout(timeo);
1883		unix_state_lock(sk);
1884		clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1885	}
1886
1887	finish_wait(sk_sleep(sk), &wait);
1888	unix_state_unlock(sk);
1889	return timeo;
1890}
1891
1892
1893
1894static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
1895			       struct msghdr *msg, size_t size,
1896			       int flags)
1897{
1898	struct sock_iocb *siocb = kiocb_to_siocb(iocb);
1899	struct scm_cookie tmp_scm;
1900	struct sock *sk = sock->sk;
1901	struct unix_sock *u = unix_sk(sk);
1902	struct sockaddr_un *sunaddr = msg->msg_name;
1903	int copied = 0;
1904	int check_creds = 0;
1905	int target;
1906	int err = 0;
1907	long timeo;
1908	int skip;
1909
1910	err = -EINVAL;
1911	if (sk->sk_state != TCP_ESTABLISHED)
1912		goto out;
1913
1914	err = -EOPNOTSUPP;
1915	if (flags&MSG_OOB)
1916		goto out;
1917
1918	target = sock_rcvlowat(sk, flags&MSG_WAITALL, size);
1919	timeo = sock_rcvtimeo(sk, flags&MSG_DONTWAIT);
1920
1921	msg->msg_namelen = 0;
1922
1923	/* Lock the socket to prevent queue disordering
1924	 * while sleeps in memcpy_tomsg
1925	 */
1926
1927	if (!siocb->scm) {
1928		siocb->scm = &tmp_scm;
1929		memset(&tmp_scm, 0, sizeof(tmp_scm));
1930	}
1931
1932	err = mutex_lock_interruptible(&u->readlock);
1933	if (err) {
1934		err = sock_intr_errno(timeo);
1935		goto out;
1936	}
1937
1938	skip = sk_peek_offset(sk, flags);
1939
1940	do {
1941		int chunk;
1942		struct sk_buff *skb;
1943
1944		unix_state_lock(sk);
1945		skb = skb_peek(&sk->sk_receive_queue);
1946again:
1947		if (skb == NULL) {
1948			unix_sk(sk)->recursion_level = 0;
1949			if (copied >= target)
1950				goto unlock;
1951
1952			/*
1953			 *	POSIX 1003.1g mandates this order.
1954			 */
1955
1956			err = sock_error(sk);
1957			if (err)
1958				goto unlock;
1959			if (sk->sk_shutdown & RCV_SHUTDOWN)
1960				goto unlock;
1961
1962			unix_state_unlock(sk);
1963			err = -EAGAIN;
1964			if (!timeo)
1965				break;
1966			mutex_unlock(&u->readlock);
1967
1968			timeo = unix_stream_data_wait(sk, timeo);
1969
1970			if (signal_pending(current)
1971			    ||  mutex_lock_interruptible(&u->readlock)) {
1972				err = sock_intr_errno(timeo);
1973				goto out;
1974			}
1975
1976			continue;
1977 unlock:
1978			unix_state_unlock(sk);
1979			break;
1980		}
1981
1982		if (skip >= skb->len) {
1983			skip -= skb->len;
1984			skb = skb_peek_next(skb, &sk->sk_receive_queue);
1985			goto again;
1986		}
1987
1988		unix_state_unlock(sk);
1989
1990		if (check_creds) {
1991			/* Never glue messages from different writers */
1992			if ((UNIXCB(skb).pid  != siocb->scm->pid) ||
1993			    (UNIXCB(skb).cred != siocb->scm->cred))
1994				break;
1995		} else {
1996			/* Copy credentials */
1997			scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).cred);
1998			check_creds = 1;
1999		}
2000
2001		/* Copy address just once */
2002		if (sunaddr) {
2003			unix_copy_addr(msg, skb->sk);
2004			sunaddr = NULL;
2005		}
2006
2007		chunk = min_t(unsigned int, skb->len - skip, size);
2008		if (memcpy_toiovec(msg->msg_iov, skb->data + skip, chunk)) {
2009			if (copied == 0)
2010				copied = -EFAULT;
2011			break;
2012		}
2013		copied += chunk;
2014		size -= chunk;
2015
2016		/* Mark read part of skb as used */
2017		if (!(flags & MSG_PEEK)) {
2018			skb_pull(skb, chunk);
2019
2020			sk_peek_offset_bwd(sk, chunk);
2021
2022			if (UNIXCB(skb).fp)
2023				unix_detach_fds(siocb->scm, skb);
2024
2025			if (skb->len)
2026				break;
2027
2028			skb_unlink(skb, &sk->sk_receive_queue);
2029			consume_skb(skb);
2030
2031			if (siocb->scm->fp)
2032				break;
2033		} else {
2034			/* It is questionable, see note in unix_dgram_recvmsg.
2035			 */
2036			if (UNIXCB(skb).fp)
2037				siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);
2038
2039			sk_peek_offset_fwd(sk, chunk);
2040
2041			break;
2042		}
2043	} while (size);
2044
2045	mutex_unlock(&u->readlock);
2046	scm_recv(sock, msg, siocb->scm, flags);
2047out:
2048	return copied ? : err;
2049}
2050
2051static int unix_shutdown(struct socket *sock, int mode)
2052{
2053	struct sock *sk = sock->sk;
2054	struct sock *other;
2055
2056	mode = (mode+1)&(RCV_SHUTDOWN|SEND_SHUTDOWN);
2057
2058	if (!mode)
2059		return 0;
2060
2061	unix_state_lock(sk);
2062	sk->sk_shutdown |= mode;
2063	other = unix_peer(sk);
2064	if (other)
2065		sock_hold(other);
2066	unix_state_unlock(sk);
2067	sk->sk_state_change(sk);
2068
2069	if (other &&
2070		(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
2071
2072		int peer_mode = 0;
2073
2074		if (mode&RCV_SHUTDOWN)
2075			peer_mode |= SEND_SHUTDOWN;
2076		if (mode&SEND_SHUTDOWN)
2077			peer_mode |= RCV_SHUTDOWN;
2078		unix_state_lock(other);
2079		other->sk_shutdown |= peer_mode;
2080		unix_state_unlock(other);
2081		other->sk_state_change(other);
2082		if (peer_mode == SHUTDOWN_MASK)
2083			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
2084		else if (peer_mode & RCV_SHUTDOWN)
2085			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
2086	}
2087	if (other)
2088		sock_put(other);
2089
2090	return 0;
2091}
2092
2093long unix_inq_len(struct sock *sk)
2094{
2095	struct sk_buff *skb;
2096	long amount = 0;
2097
2098	if (sk->sk_state == TCP_LISTEN)
2099		return -EINVAL;
2100
2101	spin_lock(&sk->sk_receive_queue.lock);
2102	if (sk->sk_type == SOCK_STREAM ||
2103	    sk->sk_type == SOCK_SEQPACKET) {
2104		skb_queue_walk(&sk->sk_receive_queue, skb)
2105			amount += skb->len;
2106	} else {
2107		skb = skb_peek(&sk->sk_receive_queue);
2108		if (skb)
2109			amount = skb->len;
2110	}
2111	spin_unlock(&sk->sk_receive_queue.lock);
2112
2113	return amount;
2114}
2115EXPORT_SYMBOL_GPL(unix_inq_len);
2116
2117long unix_outq_len(struct sock *sk)
2118{
2119	return sk_wmem_alloc_get(sk);
2120}
2121EXPORT_SYMBOL_GPL(unix_outq_len);
2122
2123static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2124{
2125	struct sock *sk = sock->sk;
2126	long amount = 0;
2127	int err;
2128
2129	switch (cmd) {
2130	case SIOCOUTQ:
2131		amount = unix_outq_len(sk);
2132		err = put_user(amount, (int __user *)arg);
2133		break;
2134	case SIOCINQ:
2135		amount = unix_inq_len(sk);
2136		if (amount < 0)
2137			err = amount;
2138		else
2139			err = put_user(amount, (int __user *)arg);
2140		break;
2141	default:
2142		err = -ENOIOCTLCMD;
2143		break;
2144	}
2145	return err;
2146}
2147
2148static unsigned int unix_poll(struct file *file, struct socket *sock, poll_table *wait)
2149{
2150	struct sock *sk = sock->sk;
2151	unsigned int mask;
2152
2153	sock_poll_wait(file, sk_sleep(sk), wait);
2154	mask = 0;
2155
2156	/* exceptional events? */
2157	if (sk->sk_err)
2158		mask |= POLLERR;
2159	if (sk->sk_shutdown == SHUTDOWN_MASK)
2160		mask |= POLLHUP;
2161	if (sk->sk_shutdown & RCV_SHUTDOWN)
2162		mask |= POLLRDHUP | POLLIN | POLLRDNORM;
2163
2164	/* readable? */
2165	if (!skb_queue_empty(&sk->sk_receive_queue))
2166		mask |= POLLIN | POLLRDNORM;
2167
2168	/* Connection-based need to check for termination and startup */
2169	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
2170	    sk->sk_state == TCP_CLOSE)
2171		mask |= POLLHUP;
2172
2173	/*
2174	 * we set writable also when the other side has shut down the
2175	 * connection. This prevents stuck sockets.
2176	 */
2177	if (unix_writable(sk))
2178		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
2179
2180	return mask;
2181}
2182
2183static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
2184				    poll_table *wait)
2185{
2186	struct sock *sk = sock->sk, *other;
2187	unsigned int mask, writable;
2188
2189	sock_poll_wait(file, sk_sleep(sk), wait);
2190	mask = 0;
2191
2192	/* exceptional events? */
2193	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
2194		mask |= POLLERR;
2195	if (sk->sk_shutdown & RCV_SHUTDOWN)
2196		mask |= POLLRDHUP | POLLIN | POLLRDNORM;
2197	if (sk->sk_shutdown == SHUTDOWN_MASK)
2198		mask |= POLLHUP;
2199
2200	/* readable? */
2201	if (!skb_queue_empty(&sk->sk_receive_queue))
2202		mask |= POLLIN | POLLRDNORM;
2203
2204	/* Connection-based need to check for termination and startup */
2205	if (sk->sk_type == SOCK_SEQPACKET) {
2206		if (sk->sk_state == TCP_CLOSE)
2207			mask |= POLLHUP;
2208		/* connection hasn't started yet? */
2209		if (sk->sk_state == TCP_SYN_SENT)
2210			return mask;
2211	}
2212
2213	/* No write status requested, avoid expensive OUT tests. */
2214	if (wait && !(wait->key & (POLLWRBAND | POLLWRNORM | POLLOUT)))
2215		return mask;
2216
2217	writable = unix_writable(sk);
2218	other = unix_peer_get(sk);
2219	if (other) {
2220		if (unix_peer(other) != sk) {
2221			sock_poll_wait(file, &unix_sk(other)->peer_wait, wait);
2222			if (unix_recvq_full(other))
2223				writable = 0;
2224		}
2225		sock_put(other);
2226	}
2227
2228	if (writable)
2229		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
2230	else
2231		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
2232
2233	return mask;
2234}
2235
2236#ifdef CONFIG_PROC_FS
2237static struct sock *first_unix_socket(int *i)
2238{
2239	for (*i = 0; *i <= UNIX_HASH_SIZE; (*i)++) {
2240		if (!hlist_empty(&unix_socket_table[*i]))
2241			return __sk_head(&unix_socket_table[*i]);
2242	}
2243	return NULL;
2244}
2245
2246static struct sock *next_unix_socket(int *i, struct sock *s)
2247{
2248	struct sock *next = sk_next(s);
2249	/* More in this chain? */
2250	if (next)
2251		return next;
2252	/* Look for next non-empty chain. */
2253	for ((*i)++; *i <= UNIX_HASH_SIZE; (*i)++) {
2254		if (!hlist_empty(&unix_socket_table[*i]))
2255			return __sk_head(&unix_socket_table[*i]);
2256	}
2257	return NULL;
2258}
2259
2260struct unix_iter_state {
2261	struct seq_net_private p;
2262	int i;
2263};
2264
2265static struct sock *unix_seq_idx(struct seq_file *seq, loff_t pos)
2266{
2267	struct unix_iter_state *iter = seq->private;
2268	loff_t off = 0;
2269	struct sock *s;
2270
2271	for (s = first_unix_socket(&iter->i); s; s = next_unix_socket(&iter->i, s)) {
2272		if (sock_net(s) != seq_file_net(seq))
2273			continue;
2274		if (off == pos)
2275			return s;
2276		++off;
2277	}
2278	return NULL;
2279}
2280
2281static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
2282	__acquires(unix_table_lock)
2283{
2284	spin_lock(&unix_table_lock);
2285	return *pos ? unix_seq_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2286}
2287
2288static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2289{
2290	struct unix_iter_state *iter = seq->private;
2291	struct sock *sk = v;
2292	++*pos;
2293
2294	if (v == SEQ_START_TOKEN)
2295		sk = first_unix_socket(&iter->i);
2296	else
2297		sk = next_unix_socket(&iter->i, sk);
2298	while (sk && (sock_net(sk) != seq_file_net(seq)))
2299		sk = next_unix_socket(&iter->i, sk);
2300	return sk;
2301}
2302
2303static void unix_seq_stop(struct seq_file *seq, void *v)
2304	__releases(unix_table_lock)
2305{
2306	spin_unlock(&unix_table_lock);
2307}
2308
2309static int unix_seq_show(struct seq_file *seq, void *v)
2310{
2311
2312	if (v == SEQ_START_TOKEN)
2313		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
2314			 "Inode Path\n");
2315	else {
2316		struct sock *s = v;
2317		struct unix_sock *u = unix_sk(s);
2318		unix_state_lock(s);
2319
2320		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
2321			s,
2322			atomic_read(&s->sk_refcnt),
2323			0,
2324			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
2325			s->sk_type,
2326			s->sk_socket ?
2327			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
2328			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
2329			sock_i_ino(s));
2330
2331		if (u->addr) {
2332			int i, len;
2333			seq_putc(seq, ' ');
2334
2335			i = 0;
2336			len = u->addr->len - sizeof(short);
2337			if (!UNIX_ABSTRACT(s))
2338				len--;
2339			else {
2340				seq_putc(seq, '@');
2341				i++;
2342			}
2343			for ( ; i < len; i++)
2344				seq_putc(seq, u->addr->name->sun_path[i]);
2345		}
2346		unix_state_unlock(s);
2347		seq_putc(seq, '\n');
2348	}
2349
2350	return 0;
2351}
2352
2353static const struct seq_operations unix_seq_ops = {
2354	.start  = unix_seq_start,
2355	.next   = unix_seq_next,
2356	.stop   = unix_seq_stop,
2357	.show   = unix_seq_show,
2358};
2359
2360static int unix_seq_open(struct inode *inode, struct file *file)
2361{
2362	return seq_open_net(inode, file, &unix_seq_ops,
2363			    sizeof(struct unix_iter_state));
2364}
2365
2366static const struct file_operations unix_seq_fops = {
2367	.owner		= THIS_MODULE,
2368	.open		= unix_seq_open,
2369	.read		= seq_read,
2370	.llseek		= seq_lseek,
2371	.release	= seq_release_net,
2372};
2373
2374#endif
2375
2376static const struct net_proto_family unix_family_ops = {
2377	.family = PF_UNIX,
2378	.create = unix_create,
2379	.owner	= THIS_MODULE,
2380};
2381
2382
2383static int __net_init unix_net_init(struct net *net)
2384{
2385	int error = -ENOMEM;
2386
2387	net->unx.sysctl_max_dgram_qlen = 10;
2388	if (unix_sysctl_register(net))
2389		goto out;
2390
2391#ifdef CONFIG_PROC_FS
2392	if (!proc_net_fops_create(net, "unix", 0, &unix_seq_fops)) {
2393		unix_sysctl_unregister(net);
2394		goto out;
2395	}
2396#endif
2397	error = 0;
2398out:
2399	return error;
2400}
2401
2402static void __net_exit unix_net_exit(struct net *net)
2403{
2404	unix_sysctl_unregister(net);
2405	proc_net_remove(net, "unix");
2406}
2407
2408static struct pernet_operations unix_net_ops = {
2409	.init = unix_net_init,
2410	.exit = unix_net_exit,
2411};
2412
2413static int __init af_unix_init(void)
2414{
2415	int rc = -1;
2416	struct sk_buff *dummy_skb;
2417
2418	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof(dummy_skb->cb));
2419
2420	rc = proto_register(&unix_proto, 1);
2421	if (rc != 0) {
2422		printk(KERN_CRIT "%s: Cannot create unix_sock SLAB cache!\n",
2423		       __func__);
2424		goto out;
2425	}
2426
2427	sock_register(&unix_family_ops);
2428	register_pernet_subsys(&unix_net_ops);
2429out:
2430	return rc;
2431}
2432
2433static void __exit af_unix_exit(void)
2434{
2435	sock_unregister(PF_UNIX);
2436	proto_unregister(&unix_proto);
2437	unregister_pernet_subsys(&unix_net_ops);
2438}
2439
2440/* Earlier than device_initcall() so that other drivers invoking
2441   request_module() don't end up in a loop when modprobe tries
2442   to use a UNIX socket. But later than subsys_initcall() because
2443   we depend on stuff initialised there */
2444fs_initcall(af_unix_init);
2445module_exit(af_unix_exit);
2446
2447MODULE_LICENSE("GPL");
2448MODULE_ALIAS_NETPROTO(PF_UNIX);
2449