af_unix.c revision ded34e0fe8fe8c2d595bfa30626654e4b87621e0
1/*
2 * NET4:	Implementation of BSD Unix domain sockets.
3 *
4 * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
5 *
6 *		This program is free software; you can redistribute it and/or
7 *		modify it under the terms of the GNU General Public License
8 *		as published by the Free Software Foundation; either version
9 *		2 of the License, or (at your option) any later version.
10 *
11 * Fixes:
12 *		Linus Torvalds	:	Assorted bug cures.
13 *		Niibe Yutaka	:	async I/O support.
14 *		Carsten Paeth	:	PF_UNIX check, address fixes.
15 *		Alan Cox	:	Limit size of allocated blocks.
16 *		Alan Cox	:	Fixed the stupid socketpair bug.
17 *		Alan Cox	:	BSD compatibility fine tuning.
18 *		Alan Cox	:	Fixed a bug in connect when interrupted.
19 *		Alan Cox	:	Sorted out a proper draft version of
20 *					file descriptor passing hacked up from
21 *					Mike Shaver's work.
22 *		Marty Leisner	:	Fixes to fd passing
23 *		Nick Nevin	:	recvmsg bugfix.
24 *		Alan Cox	:	Started proper garbage collector
25 *		Heiko EiBfeldt	:	Missing verify_area check
26 *		Alan Cox	:	Started POSIXisms
27 *		Andreas Schwab	:	Replace inode by dentry for proper
28 *					reference counting
29 *		Kirk Petersen	:	Made this a module
30 *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
31 *					Lots of bug fixes.
 *	     Alexey Kuznetsov	:	Repaired (I hope) bugs introduced
 *					by above two patches.
 *	     Andrea Arcangeli	:	If possible we block in connect(2)
 *					if the max backlog of the listen socket
 *					has been reached. This won't break
 *					old apps and it will avoid a huge amount
 *					of socks hashed (this is for unix_gc()
 *					performance reasons).
40 *					Security fix that limits the max
41 *					number of socks to 2*max_files and
42 *					the number of skb queueable in the
43 *					dgram receiver.
44 *		Artur Skawina   :	Hash function optimizations
45 *	     Alexey Kuznetsov   :	Full scale SMP. Lot of bugs are introduced 8)
46 *	      Malcolm Beattie   :	Set peercred for socketpair
47 *	     Michal Ostrowski   :       Module initialization cleanup.
48 *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
49 *	     				the core infrastructure is doing that
50 *	     				for all net proto families now (2.5.69+)
51 *
52 *
53 * Known differences from reference BSD that was tested:
54 *
55 *	[TO FIX]
56 *	ECONNREFUSED is not returned from one end of a connected() socket to the
57 *		other the moment one end closes.
58 *	fstat() doesn't return st_dev=0, and give the blksize as high water mark
59 *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
60 *	[NOT TO FIX]
61 *	accept() returns a path name even if the connecting socket has closed
62 *		in the meantime (BSD loses the path and gives up).
63 *	accept() returns 0 length path for an unbound connector. BSD returns 16
64 *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
65 *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
66 *	BSD af_unix apparently has connect forgetting to block properly.
67 *		(need to check this with the POSIX spec in detail)
68 *
69 * Differences from 2.0.0-11-... (ANK)
70 *	Bug fixes and improvements.
71 *		- client shutdown killed server socket.
72 *		- removed all useless cli/sti pairs.
73 *
74 *	Semantic changes/extensions.
75 *		- generic control message passing.
76 *		- SCM_CREDENTIALS control message.
77 *		- "Abstract" (not FS based) socket bindings.
78 *		  Abstract names are sequences of bytes (not zero terminated)
79 *		  started by 0, so that this name space does not intersect
80 *		  with BSD names.
81 */
82
83#include <linux/module.h>
84#include <linux/kernel.h>
85#include <linux/signal.h>
86#include <linux/sched.h>
87#include <linux/errno.h>
88#include <linux/string.h>
89#include <linux/stat.h>
90#include <linux/dcache.h>
91#include <linux/namei.h>
92#include <linux/socket.h>
93#include <linux/un.h>
94#include <linux/fcntl.h>
95#include <linux/termios.h>
96#include <linux/sockios.h>
97#include <linux/net.h>
98#include <linux/in.h>
99#include <linux/fs.h>
100#include <linux/slab.h>
101#include <asm/uaccess.h>
102#include <linux/skbuff.h>
103#include <linux/netdevice.h>
104#include <net/net_namespace.h>
105#include <net/sock.h>
106#include <net/tcp_states.h>
107#include <net/af_unix.h>
108#include <linux/proc_fs.h>
109#include <linux/seq_file.h>
110#include <net/scm.h>
111#include <linux/init.h>
112#include <linux/poll.h>
113#include <linux/rtnetlink.h>
114#include <linux/mount.h>
115#include <net/checksum.h>
116#include <linux/security.h>
117
/*
 * Global hash table holding every AF_UNIX socket, and the spinlock that
 * guards it. The table has 2 * UNIX_HASH_SIZE buckets; sockets without a
 * name are placed in the upper half (see unix_sockets_unbound()).
 * Both symbols are exported (GPL only).
 */
struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE];
EXPORT_SYMBOL_GPL(unix_socket_table);
DEFINE_SPINLOCK(unix_table_lock);
EXPORT_SYMBOL_GPL(unix_table_lock);
/* Number of unix socks alive: raised in unix_create1(), dropped in unix_sock_destructor(). */
static atomic_long_t unix_nr_socks;
123
124
125static struct hlist_head *unix_sockets_unbound(void *addr)
126{
127	unsigned long hash = (unsigned long)addr;
128
129	hash ^= hash >> 16;
130	hash ^= hash >> 8;
131	hash %= UNIX_HASH_SIZE;
132	return &unix_socket_table[UNIX_HASH_SIZE + hash];
133}
134
/*
 * True when the hash stored in the socket's bound address is below
 * UNIX_HASH_SIZE. unix_bind() stores exactly UNIX_HASH_SIZE for filesystem
 * names, so this is false for those.
 * Dereferences ->addr unconditionally: only use on a bound socket.
 */
#define UNIX_ABSTRACT(sk)	(unix_sk(sk)->addr->hash < UNIX_HASH_SIZE)
136
#ifdef CONFIG_SECURITY_NETWORK
/*
 * Copy the security ID held in @scm into the slot that UNIXSID(@skb)
 * designates. Declared inline like the other three variants below.
 */
static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	memcpy(UNIXSID(skb), &scm->secid, sizeof(u32));
}

/* Load the security ID stored at UNIXSID(@skb) back into @scm. */
static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	scm->secid = *UNIXSID(skb);
}
#else
/* Without CONFIG_SECURITY_NETWORK both helpers are empty stubs. */
static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }
#endif /* CONFIG_SECURITY_NETWORK */
154
155/*
156 *  SMP locking strategy:
157 *    hash table is protected with spinlock unix_table_lock
158 *    each socket state is protected by separate spin lock.
159 */
160
161static inline unsigned int unix_hash_fold(__wsum n)
162{
163	unsigned int hash = (__force unsigned int)n;
164
165	hash ^= hash>>16;
166	hash ^= hash>>8;
167	return hash&(UNIX_HASH_SIZE-1);
168}
169
/* Peer pointer of a unix socket; expands to an lvalue, so it is also assigned through. */
#define unix_peer(sk) (unix_sk(sk)->peer)
171
/* Non-zero when @osk's peer pointer refers to @sk. */
static inline int unix_our_peer(struct sock *sk, struct sock *osk)
{
	struct sock *osk_peer = unix_peer(osk);

	return osk_peer == sk;
}
176
177static inline int unix_may_send(struct sock *sk, struct sock *osk)
178{
179	return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
180}
181
182static inline int unix_recvq_full(struct sock const *sk)
183{
184	return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
185}
186
187struct sock *unix_peer_get(struct sock *s)
188{
189	struct sock *peer;
190
191	unix_state_lock(s);
192	peer = unix_peer(s);
193	if (peer)
194		sock_hold(peer);
195	unix_state_unlock(s);
196	return peer;
197}
198EXPORT_SYMBOL_GPL(unix_peer_get);
199
200static inline void unix_release_addr(struct unix_address *addr)
201{
202	if (atomic_dec_and_test(&addr->refcnt))
203		kfree(addr);
204}
205
206/*
207 *	Check unix socket name:
208 *		- should be not zero length.
209 *	        - if started by not zero, should be NULL terminated (FS object)
210 *		- if started by zero, it is abstract name.
211 */
212
213static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned int *hashp)
214{
215	if (len <= sizeof(short) || len > sizeof(*sunaddr))
216		return -EINVAL;
217	if (!sunaddr || sunaddr->sun_family != AF_UNIX)
218		return -EINVAL;
219	if (sunaddr->sun_path[0]) {
220		/*
221		 * This may look like an off by one error but it is a bit more
222		 * subtle. 108 is the longest valid AF_UNIX path for a binding.
223		 * sun_path[108] doesn't as such exist.  However in kernel space
224		 * we are guaranteed that it is a valid memory location in our
225		 * kernel address buffer.
226		 */
227		((char *)sunaddr)[len] = 0;
228		len = strlen(sunaddr->sun_path)+1+sizeof(short);
229		return len;
230	}
231
232	*hashp = unix_hash_fold(csum_partial(sunaddr, len, 0));
233	return len;
234}
235
/*
 * Unhash @sk from whichever bucket it is on. Does no locking itself; the
 * callers in this file hold unix_table_lock.
 */
static void __unix_remove_socket(struct sock *sk)
{
	sk_del_node_init(sk);
}
240
/*
 * Add @sk to bucket @list. Warns if the socket is still hashed somewhere.
 * Does no locking itself; the callers in this file hold unix_table_lock.
 */
static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
{
	WARN_ON(!sk_unhashed(sk));
	sk_add_node(sk, list);
}
246
/* Locked variant of __unix_remove_socket(): takes unix_table_lock around it. */
static inline void unix_remove_socket(struct sock *sk)
{
	spin_lock(&unix_table_lock);
	__unix_remove_socket(sk);
	spin_unlock(&unix_table_lock);
}
253
/* Locked variant of __unix_insert_socket(): takes unix_table_lock around it. */
static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
{
	spin_lock(&unix_table_lock);
	__unix_insert_socket(list, sk);
	spin_unlock(&unix_table_lock);
}
260
261static struct sock *__unix_find_socket_byname(struct net *net,
262					      struct sockaddr_un *sunname,
263					      int len, int type, unsigned int hash)
264{
265	struct sock *s;
266
267	sk_for_each(s, &unix_socket_table[hash ^ type]) {
268		struct unix_sock *u = unix_sk(s);
269
270		if (!net_eq(sock_net(s), net))
271			continue;
272
273		if (u->addr->len == len &&
274		    !memcmp(u->addr->name, sunname, len))
275			goto found;
276	}
277	s = NULL;
278found:
279	return s;
280}
281
282static inline struct sock *unix_find_socket_byname(struct net *net,
283						   struct sockaddr_un *sunname,
284						   int len, int type,
285						   unsigned int hash)
286{
287	struct sock *s;
288
289	spin_lock(&unix_table_lock);
290	s = __unix_find_socket_byname(net, sunname, len, type, hash);
291	if (s)
292		sock_hold(s);
293	spin_unlock(&unix_table_lock);
294	return s;
295}
296
297static struct sock *unix_find_socket_byinode(struct inode *i)
298{
299	struct sock *s;
300
301	spin_lock(&unix_table_lock);
302	sk_for_each(s,
303		    &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
304		struct dentry *dentry = unix_sk(s)->path.dentry;
305
306		if (dentry && dentry->d_inode == i) {
307			sock_hold(s);
308			goto found;
309		}
310	}
311	s = NULL;
312found:
313	spin_unlock(&unix_table_lock);
314	return s;
315}
316
317static inline int unix_writable(struct sock *sk)
318{
319	return (atomic_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
320}
321
322static void unix_write_space(struct sock *sk)
323{
324	struct socket_wq *wq;
325
326	rcu_read_lock();
327	if (unix_writable(sk)) {
328		wq = rcu_dereference(sk->sk_wq);
329		if (wq_has_sleeper(wq))
330			wake_up_interruptible_sync_poll(&wq->wait,
331				POLLOUT | POLLWRNORM | POLLWRBAND);
332		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
333	}
334	rcu_read_unlock();
335}
336
337/* When dgram socket disconnects (or changes its peer), we clear its receive
338 * queue of packets arrived from previous peer. First, it allows to do
339 * flow control based only on wmem_alloc; second, sk connected to peer
340 * may receive messages only from that peer. */
341static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
342{
343	if (!skb_queue_empty(&sk->sk_receive_queue)) {
344		skb_queue_purge(&sk->sk_receive_queue);
345		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
346
347		/* If one link of bidirectional dgram pipe is disconnected,
348		 * we signal error. Messages are lost. Do not make this,
349		 * when peer was not connected to us.
350		 */
351		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
352			other->sk_err = ECONNRESET;
353			other->sk_error_report(other);
354		}
355	}
356}
357
358static void unix_sock_destructor(struct sock *sk)
359{
360	struct unix_sock *u = unix_sk(sk);
361
362	skb_queue_purge(&sk->sk_receive_queue);
363
364	WARN_ON(atomic_read(&sk->sk_wmem_alloc));
365	WARN_ON(!sk_unhashed(sk));
366	WARN_ON(sk->sk_socket);
367	if (!sock_flag(sk, SOCK_DEAD)) {
368		printk(KERN_INFO "Attempt to release alive unix socket: %p\n", sk);
369		return;
370	}
371
372	if (u->addr)
373		unix_release_addr(u->addr);
374
375	atomic_long_dec(&unix_nr_socks);
376	local_bh_disable();
377	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
378	local_bh_enable();
379#ifdef UNIX_REFCNT_DEBUG
380	printk(KERN_DEBUG "UNIX %p is destroyed, %ld are still alive.\n", sk,
381		atomic_long_read(&unix_nr_socks));
382#endif
383}
384
/*
 * Tear down a unix socket: unhash it, orphan it and mark it closed, notify
 * a connected peer, flush its receive queue and drop one reference on it.
 *
 * @sk:      socket being released.
 * @embrion: non-zero when @sk was found queued on a listening socket that is
 *           itself being released (see the recursive call below). For
 *           stream/seqpacket sockets this makes the peer see ECONNRESET.
 */
static void unix_release_sock(struct sock *sk, int embrion)
{
	struct unix_sock *u = unix_sk(sk);
	struct path path;
	struct sock *skpair;
	struct sk_buff *skb;
	int state;

	unix_remove_socket(sk);

	/* Clear state */
	unix_state_lock(sk);
	sock_orphan(sk);
	sk->sk_shutdown = SHUTDOWN_MASK;
	/* Take over the path reference; it is put after the lock is dropped. */
	path	     = u->path;
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	/* Remember the previous state: a listener needs its queue handled specially. */
	state = sk->sk_state;
	sk->sk_state = TCP_CLOSE;
	unix_state_unlock(sk);

	wake_up_interruptible_all(&u->peer_wait);

	skpair = unix_peer(sk);

	if (skpair != NULL) {
		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
			unix_state_lock(skpair);
			/* No more writes */
			skpair->sk_shutdown = SHUTDOWN_MASK;
			/* Unread data on our side (or an unaccepted connection) means a reset. */
			if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
				skpair->sk_err = ECONNRESET;
			unix_state_unlock(skpair);
			skpair->sk_state_change(skpair);
			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
		}
		sock_put(skpair); /* It may now die */
		unix_peer(sk) = NULL;
	}

	/* Try to flush out this socket. Throw out buffers at least */

	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
		/* On a listener each queued skb stands for a pending connection: release its sock too. */
		if (state == TCP_LISTEN)
			unix_release_sock(skb->sk, 1);
		/* passed fds are erased in the kfree_skb hook	      */
		kfree_skb(skb);
	}

	if (path.dentry)
		path_put(&path);

	sock_put(sk);

	/* ---- Socket is dead now and most probably destroyed ---- */

	/*
	 * Fixme: BSD difference: In BSD all sockets connected to us get
	 *	  ECONNRESET and we die on the spot. In Linux we behave
	 *	  like files and pipes do and wait for the last
	 *	  dereference.
	 *
	 * Can't we simply set sock->err?
	 *
	 *	  What the above comment does talk about? --ANK(980817)
	 */

	if (unix_tot_inflight)
		unix_gc();		/* Garbage collect fds */
}
455
456static void init_peercred(struct sock *sk)
457{
458	put_pid(sk->sk_peer_pid);
459	if (sk->sk_peer_cred)
460		put_cred(sk->sk_peer_cred);
461	sk->sk_peer_pid  = get_pid(task_tgid(current));
462	sk->sk_peer_cred = get_current_cred();
463}
464
465static void copy_peercred(struct sock *sk, struct sock *peersk)
466{
467	put_pid(sk->sk_peer_pid);
468	if (sk->sk_peer_cred)
469		put_cred(sk->sk_peer_cred);
470	sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
471	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
472}
473
474static int unix_listen(struct socket *sock, int backlog)
475{
476	int err;
477	struct sock *sk = sock->sk;
478	struct unix_sock *u = unix_sk(sk);
479	struct pid *old_pid = NULL;
480
481	err = -EOPNOTSUPP;
482	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
483		goto out;	/* Only stream/seqpacket sockets accept */
484	err = -EINVAL;
485	if (!u->addr)
486		goto out;	/* No listens on an unbound socket */
487	unix_state_lock(sk);
488	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
489		goto out_unlock;
490	if (backlog > sk->sk_max_ack_backlog)
491		wake_up_interruptible_all(&u->peer_wait);
492	sk->sk_max_ack_backlog	= backlog;
493	sk->sk_state		= TCP_LISTEN;
494	/* set credentials so connect can copy them */
495	init_peercred(sk);
496	err = 0;
497
498out_unlock:
499	unix_state_unlock(sk);
500	put_pid(old_pid);
501out:
502	return err;
503}
504
/*
 * Forward declarations for the handlers referenced by the proto_ops
 * tables below (unix_stream_ops, unix_dgram_ops, unix_seqpacket_ops).
 */
static int unix_release(struct socket *);
static int unix_bind(struct socket *, struct sockaddr *, int);
static int unix_stream_connect(struct socket *, struct sockaddr *,
			       int addr_len, int flags);
static int unix_socketpair(struct socket *, struct socket *);
static int unix_accept(struct socket *, struct socket *, int);
static int unix_getname(struct socket *, struct sockaddr *, int *, int);
static unsigned int unix_poll(struct file *, struct socket *, poll_table *);
static unsigned int unix_dgram_poll(struct file *, struct socket *,
				    poll_table *);
static int unix_ioctl(struct socket *, unsigned int, unsigned long);
static int unix_shutdown(struct socket *, int);
static int unix_stream_sendmsg(struct kiocb *, struct socket *,
			       struct msghdr *, size_t);
static int unix_stream_recvmsg(struct kiocb *, struct socket *,
			       struct msghdr *, size_t, int);
static int unix_dgram_sendmsg(struct kiocb *, struct socket *,
			      struct msghdr *, size_t);
static int unix_dgram_recvmsg(struct kiocb *, struct socket *,
			      struct msghdr *, size_t, int);
static int unix_dgram_connect(struct socket *, struct sockaddr *,
			      int, int);
static int unix_seqpacket_sendmsg(struct kiocb *, struct socket *,
				  struct msghdr *, size_t);
static int unix_seqpacket_recvmsg(struct kiocb *, struct socket *,
				  struct msghdr *, size_t, int);
531
532static void unix_set_peek_off(struct sock *sk, int val)
533{
534	struct unix_sock *u = unix_sk(sk);
535
536	mutex_lock(&u->readlock);
537	sk->sk_peek_off = val;
538	mutex_unlock(&u->readlock);
539}
540
541
/*
 * Socket operations for SOCK_STREAM unix sockets; installed by
 * unix_create(). Socket options, mmap and sendpage use the generic
 * "not supported" handlers.
 */
static const struct proto_ops unix_stream_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_poll,
	.ioctl =	unix_ioctl,
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	unix_stream_sendmsg,
	.recvmsg =	unix_stream_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
	.set_peek_off =	unix_set_peek_off,
};
563
/*
 * Socket operations for SOCK_DGRAM unix sockets (also used for SOCK_RAW,
 * which unix_create() turns into SOCK_DGRAM). Datagram sockets cannot
 * listen or accept, so those entries use the generic sock_no_* handlers.
 */
static const struct proto_ops unix_dgram_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_dgram_connect,
	.socketpair =	unix_socketpair,
	.accept =	sock_no_accept,
	.getname =	unix_getname,
	.poll =		unix_dgram_poll,
	.ioctl =	unix_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	unix_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	unix_dgram_sendmsg,
	.recvmsg =	unix_dgram_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
	.set_peek_off =	unix_set_peek_off,
};
585
/*
 * Socket operations for SOCK_SEQPACKET unix sockets. Connection handling
 * (connect/accept/listen) is shared with the stream table, polling is
 * shared with the dgram table, and send/receive have seqpacket-specific
 * entry points.
 */
static const struct proto_ops unix_seqpacket_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_dgram_poll,
	.ioctl =	unix_ioctl,
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	unix_seqpacket_sendmsg,
	.recvmsg =	unix_seqpacket_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
	.set_peek_off =	unix_set_peek_off,
};
607
/*
 * Protocol descriptor handed to sk_alloc() in unix_create1(); obj_size
 * makes every allocated sock large enough to be a struct unix_sock.
 */
static struct proto unix_proto = {
	.name			= "UNIX",
	.owner			= THIS_MODULE,
	.obj_size		= sizeof(struct unix_sock),
};
613
/*
 * AF_UNIX sockets do not interact with hardware, hence they
 * don't trigger interrupts - so it's safe for them to have
 * bh-unsafe locking for their sk_receive_queue.lock. Split off
 * this special lock-class by reinitializing the spinlock key
 * (done with lockdep_set_class() in unix_create1()):
 */
static struct lock_class_key af_unix_sk_receive_queue_lock_key;
621
622static struct sock *unix_create1(struct net *net, struct socket *sock)
623{
624	struct sock *sk = NULL;
625	struct unix_sock *u;
626
627	atomic_long_inc(&unix_nr_socks);
628	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files())
629		goto out;
630
631	sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto);
632	if (!sk)
633		goto out;
634
635	sock_init_data(sock, sk);
636	lockdep_set_class(&sk->sk_receive_queue.lock,
637				&af_unix_sk_receive_queue_lock_key);
638
639	sk->sk_write_space	= unix_write_space;
640	sk->sk_max_ack_backlog	= net->unx.sysctl_max_dgram_qlen;
641	sk->sk_destruct		= unix_sock_destructor;
642	u	  = unix_sk(sk);
643	u->path.dentry = NULL;
644	u->path.mnt = NULL;
645	spin_lock_init(&u->lock);
646	atomic_long_set(&u->inflight, 0);
647	INIT_LIST_HEAD(&u->link);
648	mutex_init(&u->readlock); /* single task reading lock */
649	init_waitqueue_head(&u->peer_wait);
650	unix_insert_socket(unix_sockets_unbound(sk), sk);
651out:
652	if (sk == NULL)
653		atomic_long_dec(&unix_nr_socks);
654	else {
655		local_bh_disable();
656		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
657		local_bh_enable();
658	}
659	return sk;
660}
661
662static int unix_create(struct net *net, struct socket *sock, int protocol,
663		       int kern)
664{
665	if (protocol && protocol != PF_UNIX)
666		return -EPROTONOSUPPORT;
667
668	sock->state = SS_UNCONNECTED;
669
670	switch (sock->type) {
671	case SOCK_STREAM:
672		sock->ops = &unix_stream_ops;
673		break;
674		/*
675		 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
676		 *	nothing uses it.
677		 */
678	case SOCK_RAW:
679		sock->type = SOCK_DGRAM;
680	case SOCK_DGRAM:
681		sock->ops = &unix_dgram_ops;
682		break;
683	case SOCK_SEQPACKET:
684		sock->ops = &unix_seqpacket_ops;
685		break;
686	default:
687		return -ESOCKTNOSUPPORT;
688	}
689
690	return unix_create1(net, sock) ? 0 : -ENOMEM;
691}
692
693static int unix_release(struct socket *sock)
694{
695	struct sock *sk = sock->sk;
696
697	if (!sk)
698		return 0;
699
700	unix_release_sock(sk, 0);
701	sock->sk = NULL;
702
703	return 0;
704}
705
/*
 * Bind @sock to an automatically chosen abstract name.
 *
 * The name is a zero byte followed by five hex digits taken from a shared
 * counter (ordernum). Candidates are tried until one is not in use in the
 * socket's namespace and type.
 *
 * Returns 0 on success or when the socket is already bound, -ENOMEM when
 * the address cannot be allocated, and -ENOSPC after 0xFFFFF failed tries.
 */
static int unix_autobind(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct unix_sock *u = unix_sk(sk);
	static u32 ordernum = 1;
	struct unix_address *addr;
	int err;
	unsigned int retries = 0;

	/* readlock serialises this against unix_bind(), which takes it too. */
	mutex_lock(&u->readlock);

	err = 0;
	if (u->addr)
		goto out;

	err = -ENOMEM;
	/* Zeroed allocation: sun_path[0] stays 0, which marks the name abstract. */
	addr = kzalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL);
	if (!addr)
		goto out;

	addr->name->sun_family = AF_UNIX;
	atomic_set(&addr->refcnt, 1);

retry:
	/* Length = hex digits + leading zero byte + family field. */
	addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short);
	addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0));

	spin_lock(&unix_table_lock);
	/* The counter is advanced under the table lock and wraps at 20 bits. */
	ordernum = (ordernum+1)&0xFFFFF;

	if (__unix_find_socket_byname(net, addr->name, addr->len, sock->type,
				      addr->hash)) {
		spin_unlock(&unix_table_lock);
		/*
		 * __unix_find_socket_byname() may take long time if many names
		 * are already in use.
		 */
		cond_resched();
		/* Give up if all names seems to be in use. */
		if (retries++ == 0xFFFFF) {
			err = -ENOSPC;
			kfree(addr);
			goto out;
		}
		goto retry;
	}
	/* Stored hash includes the socket type, matching the bucket index used for lookups. */
	addr->hash ^= sk->sk_type;

	/* Still under unix_table_lock: move the socket to its named bucket. */
	__unix_remove_socket(sk);
	u->addr = addr;
	__unix_insert_socket(&unix_socket_table[addr->hash], sk);
	spin_unlock(&unix_table_lock);
	err = 0;

out:	mutex_unlock(&u->readlock);
	return err;
}
764
/*
 * Find the socket a name refers to.
 *
 * @net:     namespace used for abstract name lookups.
 * @sunname: name to resolve.
 * @len:     length of @sunname.
 * @type:    socket type the result must have.
 * @hash:    hash of the name (only used for abstract names).
 * @error:   receives the error code when NULL is returned.
 *
 * Filesystem names are resolved through the VFS: the caller needs write
 * permission on the inode, the inode must be a socket, and a unix socket
 * must be bound to it. Abstract names are looked up in the hash table.
 *
 * Returns the socket with a reference held, or NULL with *error set
 * (lookup/permission errors, -ECONNREFUSED, or -EPROTOTYPE when the
 * socket bound to the path has a different type).
 */
static struct sock *unix_find_other(struct net *net,
				    struct sockaddr_un *sunname, int len,
				    int type, unsigned int hash, int *error)
{
	struct sock *peer;
	struct inode *inode;
	struct path path;
	int err;

	if (!sunname->sun_path[0]) {
		/* Abstract name: hash table lookup only. */
		peer = unix_find_socket_byname(net, sunname, len, type, hash);
		if (!peer) {
			err = -ECONNREFUSED;
			goto fail;
		}
		if (unix_sk(peer)->path.dentry)
			touch_atime(&unix_sk(peer)->path);
		return peer;
	}

	err = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path);
	if (err)
		goto fail;

	inode = path.dentry->d_inode;
	err = inode_permission(inode, MAY_WRITE);
	if (err)
		goto put_fail;

	err = -ECONNREFUSED;
	if (!S_ISSOCK(inode->i_mode))
		goto put_fail;

	peer = unix_find_socket_byinode(inode);
	if (!peer)
		goto put_fail;

	if (peer->sk_type != type) {
		/* Wrong kind of socket behind this path: no atime update. */
		path_put(&path);
		sock_put(peer);
		err = -EPROTOTYPE;
		goto fail;
	}

	touch_atime(&path);
	path_put(&path);
	return peer;

put_fail:
	path_put(&path);
fail:
	*error = err;
	return NULL;
}
819
820static int unix_mknod(const char *sun_path, umode_t mode, struct path *res)
821{
822	struct dentry *dentry;
823	struct path path;
824	int err = 0;
825	/*
826	 * Get the parent directory, calculate the hash for last
827	 * component.
828	 */
829	dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0);
830	err = PTR_ERR(dentry);
831	if (IS_ERR(dentry))
832		return err;
833
834	/*
835	 * All right, let's create it.
836	 */
837	err = security_path_mknod(&path, dentry, mode, 0);
838	if (!err) {
839		err = vfs_mknod(path.dentry->d_inode, dentry, mode, 0);
840		if (!err) {
841			res->mnt = mntget(path.mnt);
842			res->dentry = dget(dentry);
843		}
844	}
845	done_path_create(&path, dentry);
846	return err;
847}
848
/*
 * bind() for unix sockets.
 *
 * @sock:     socket to bind.
 * @uaddr:    requested address (struct sockaddr_un).
 * @addr_len: length of @uaddr; exactly sizeof(short) requests autobind.
 *
 * Filesystem names create a socket node and hash the socket by inode
 * number; abstract names are hashed by name and must be unique per
 * namespace and type.
 *
 * Returns 0 on success, -EINVAL for a bad address or an already bound
 * socket, -ENOMEM, -EADDRINUSE when the name is taken, or the error from
 * creating the node.
 */
static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct unix_sock *u = unix_sk(sk);
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	char *sun_path = sunaddr->sun_path;
	int err;
	unsigned int hash;
	struct unix_address *addr;
	struct hlist_head *list;

	err = -EINVAL;
	if (sunaddr->sun_family != AF_UNIX)
		goto out;

	/* Only the family was supplied: pick a name automatically. */
	if (addr_len == sizeof(short)) {
		err = unix_autobind(sock);
		goto out;
	}

	err = unix_mkname(sunaddr, addr_len, &hash);
	if (err < 0)
		goto out;
	addr_len = err;

	mutex_lock(&u->readlock);

	/* A socket can be bound only once. */
	err = -EINVAL;
	if (u->addr)
		goto out_up;

	err = -ENOMEM;
	addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
	if (!addr)
		goto out_up;

	memcpy(addr->name, sunaddr, addr_len);
	addr->len = addr_len;
	/* For filesystem names this provisional value is replaced with UNIX_HASH_SIZE below. */
	addr->hash = hash ^ sk->sk_type;
	atomic_set(&addr->refcnt, 1);

	if (sun_path[0]) {
		struct path path;
		umode_t mode = S_IFSOCK |
		       (SOCK_INODE(sock)->i_mode & ~current_umask());
		err = unix_mknod(sun_path, mode, &path);
		if (err) {
			/* An existing node means the address is taken. */
			if (err == -EEXIST)
				err = -EADDRINUSE;
			unix_release_addr(addr);
			goto out_up;
		}
		addr->hash = UNIX_HASH_SIZE;
		/* Same bucket choice as unix_find_socket_byinode(). */
		hash = path.dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1);
		spin_lock(&unix_table_lock);
		/* The socket takes over the references unix_mknod() returned. */
		u->path = path;
		list = &unix_socket_table[hash];
	} else {
		spin_lock(&unix_table_lock);
		err = -EADDRINUSE;
		if (__unix_find_socket_byname(net, sunaddr, addr_len,
					      sk->sk_type, hash)) {
			unix_release_addr(addr);
			goto out_unlock;
		}

		list = &unix_socket_table[addr->hash];
	}

	/* Both branches arrive here holding unix_table_lock: rehash the socket. */
	err = 0;
	__unix_remove_socket(sk);
	u->addr = addr;
	__unix_insert_socket(list, sk);

out_unlock:
	spin_unlock(&unix_table_lock);
out_up:
	mutex_unlock(&u->readlock);
out:
	return err;
}
931
/*
 * Take the state locks of two sockets.
 *
 * When @sk2 is NULL or the same socket as @sk1, only @sk1 is locked.
 * Otherwise both are locked, always lower address first, so that two
 * tasks locking the same pair cannot deadlock against each other.
 */
static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
{
	struct sock *first;
	struct sock *second;

	if (!sk2 || unlikely(sk1 == sk2)) {
		unix_state_lock(sk1);
		return;
	}

	if (sk1 < sk2) {
		first = sk1;
		second = sk2;
	} else {
		first = sk2;
		second = sk1;
	}

	unix_state_lock(first);
	unix_state_lock_nested(second);
}
946
/*
 * Counterpart of unix_state_double_lock(): release @sk1, and @sk2 as well
 * when it is a distinct, non-NULL socket.
 */
static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
{
	unix_state_unlock(sk1);

	if (unlikely(sk1 == sk2) || !sk2)
		return;

	unix_state_unlock(sk2);
}
956
/*
 * connect() for unix datagram sockets.
 *
 * @sock:  socket to connect.
 * @addr:  peer address; an AF_UNSPEC family dissolves the association.
 * @alen:  length of @addr.
 * @flags: not used by this function.
 *
 * Sets (or clears) the socket's peer. When an existing peer is replaced,
 * the reference on it is dropped and packets queued from it are discarded.
 *
 * Returns 0 on success, otherwise the error from name checking, autobind,
 * peer lookup or the security hook, or -EPERM when the target is connected
 * to some other socket.
 */
static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
			      int alen, int flags)
{
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
	struct sock *other;
	unsigned int hash;
	int err;

	if (addr->sa_family != AF_UNSPEC) {
		err = unix_mkname(sunaddr, alen, &hash);
		if (err < 0)
			goto out;
		alen = err;

		/* With SOCK_PASSCRED set, an unbound socket is given a name first. */
		if (test_bit(SOCK_PASSCRED, &sock->flags) &&
		    !unix_sk(sk)->addr && (err = unix_autobind(sock)) != 0)
			goto out;

restart:
		/* On success we hold a reference on other. */
		other = unix_find_other(net, sunaddr, alen, sock->type, hash, &err);
		if (!other)
			goto out;

		unix_state_double_lock(sk, other);

		/* Apparently VFS overslept socket death. Retry. */
		if (sock_flag(other, SOCK_DEAD)) {
			unix_state_double_unlock(sk, other);
			sock_put(other);
			goto restart;
		}

		/* The target must be unconnected or already connected to us. */
		err = -EPERM;
		if (!unix_may_send(sk, other))
			goto out_unlock;

		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
		if (err)
			goto out_unlock;

	} else {
		/*
		 *	1003.1g breaking connected state with AF_UNSPEC
		 */
		other = NULL;
		/* With a NULL second socket only sk is locked. */
		unix_state_double_lock(sk, other);
	}

	/*
	 * If it was connected, reconnect.
	 */
	if (unix_peer(sk)) {
		struct sock *old_peer = unix_peer(sk);
		/* The reference from unix_find_other() now belongs to the peer pointer. */
		unix_peer(sk) = other;
		unix_state_double_unlock(sk, other);

		if (other != old_peer)
			unix_dgram_disconnected(sk, old_peer);
		sock_put(old_peer);
	} else {
		unix_peer(sk) = other;
		unix_state_double_unlock(sk, other);
	}
	return 0;

out_unlock:
	unix_state_double_unlock(sk, other);
	sock_put(other);
out:
	return err;
}
1030
1031static long unix_wait_for_peer(struct sock *other, long timeo)
1032{
1033	struct unix_sock *u = unix_sk(other);
1034	int sched;
1035	DEFINE_WAIT(wait);
1036
1037	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1038
1039	sched = !sock_flag(other, SOCK_DEAD) &&
1040		!(other->sk_shutdown & RCV_SHUTDOWN) &&
1041		unix_recvq_full(other);
1042
1043	unix_state_unlock(other);
1044
1045	if (sched)
1046		timeo = schedule_timeout(timeo);
1047
1048	finish_wait(&u->peer_wait, &wait);
1049	return timeo;
1050}
1051
/*
 * Connect a connection-oriented AF_UNIX socket to a listening socket.
 *
 * @sock:     connecting socket
 * @uaddr:    address of the listener (validated by unix_mkname())
 * @addr_len: length of @uaddr
 * @flags:    file flags; O_NONBLOCK selects a zero send timeout
 *
 * A fresh sock (newsk) that will become the accepted end of the connection
 * and a one-byte skb allocated against it are created before any lock is
 * taken.  Once the listener is found and both state locks are held, sk and
 * newsk are linked as peers and the skb is queued on the listener's receive
 * queue, where unix_accept() later picks it up.
 *
 * Returns 0 on success or a negative errno (-ENOMEM, -ECONNREFUSED, -EAGAIN,
 * -EISCONN, -EINVAL, or whatever unix_mkname(), unix_autobind(),
 * unix_find_other(), sock_intr_errno() or the security hook report).
 */
static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
			       int addr_len, int flags)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
	struct sock *newsk = NULL;
	struct sock *other = NULL;
	struct sk_buff *skb = NULL;
	unsigned int hash;
	int st;
	int err;
	long timeo;

	/* On success unix_mkname() returns the length used from here on. */
	err = unix_mkname(sunaddr, addr_len, &hash);
	if (err < 0)
		goto out;
	addr_len = err;

	/* A SOCK_PASSCRED socket without an address gets one via autobind. */
	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr &&
	    (err = unix_autobind(sock)) != 0)
		goto out;

	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

	/* Allocate all resources first.
	   If we allocated after the state is locked, everything would
	   have to be rechecked anyway.
	 */

	err = -ENOMEM;

	/* create new sock for complete connection */
	newsk = unix_create1(sock_net(sk), NULL);
	if (newsk == NULL)
		goto out;

	/* Allocate skb for sending to listening sock */
	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
	if (skb == NULL)
		goto out;

restart:
	/*  Find listening sock. */
	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, hash, &err);
	if (!other)
		goto out;

	/* Latch state of peer */
	unix_state_lock(other);

	/* Apparently VFS overslept socket death. Retry. */
	if (sock_flag(other, SOCK_DEAD)) {
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = -ECONNREFUSED;
	if (other->sk_state != TCP_LISTEN)
		goto out_unlock;
	if (other->sk_shutdown & RCV_SHUTDOWN)
		goto out_unlock;

	if (unix_recvq_full(other)) {
		err = -EAGAIN;
		if (!timeo)
			goto out_unlock;

		/*
		 * NOTE(review): the paths below leave without unlocking
		 * 'other', so unix_wait_for_peer() is presumably expected to
		 * return with other's state lock released - confirm against
		 * its definition.
		 */
		timeo = unix_wait_for_peer(other, timeo);

		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			goto out;
		sock_put(other);
		goto restart;
	}

	/* Latch our state.

	   This is a tricky place. We need to grab our own state lock while
	   still holding the peer's lock, which is dangerous because a
	   deadlock is possible. The connect-to-self case and simultaneous
	   connect attempts are eliminated by checking the socket state:
	   other is TCP_LISTEN, and if sk is TCP_LISTEN too we bail out
	   before attempting to grab the lock.

	   The state has to be rechecked once the socket is locked.
	 */
	st = sk->sk_state;

	switch (st) {
	case TCP_CLOSE:
		/* This is ok... continue with connect */
		break;
	case TCP_ESTABLISHED:
		/* Socket is already connected */
		err = -EISCONN;
		goto out_unlock;
	default:
		err = -EINVAL;
		goto out_unlock;
	}

	unix_state_lock_nested(sk);

	/* State changed between the unlocked check and the lock: start over. */
	if (sk->sk_state != st) {
		unix_state_unlock(sk);
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = security_unix_stream_connect(sk, other, newsk);
	if (err) {
		unix_state_unlock(sk);
		goto out_unlock;
	}

	/* The way is open! Quickly set all the necessary fields... */

	/* newsk's peer pointer holds a reference on sk. */
	sock_hold(sk);
	unix_peer(newsk)	= sk;
	newsk->sk_state		= TCP_ESTABLISHED;
	newsk->sk_type		= sk->sk_type;
	init_peercred(newsk);
	newu = unix_sk(newsk);
	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
	otheru = unix_sk(other);

	/* copy address information from listening to new sock*/
	if (otheru->addr) {
		atomic_inc(&otheru->addr->refcnt);
		newu->addr = otheru->addr;
	}
	if (otheru->path.dentry) {
		path_get(&otheru->path);
		newu->path = otheru->path;
	}

	/* Set credentials */
	copy_peercred(sk, other);

	sock->state	= SS_CONNECTED;
	sk->sk_state	= TCP_ESTABLISHED;
	/* sk's peer pointer holds a reference on newsk. */
	sock_hold(newsk);

	smp_mb__after_atomic_inc();	/* sock_hold() does an atomic_inc() */
	unix_peer(sk)	= newsk;

	unix_state_unlock(sk);

	/* Queue the connection request on the listening sock and notify it */
	spin_lock(&other->sk_receive_queue.lock);
	__skb_queue_tail(&other->sk_receive_queue, skb);
	spin_unlock(&other->sk_receive_queue.lock);
	unix_state_unlock(other);
	other->sk_data_ready(other, 0);
	sock_put(other);
	return 0;

out_unlock:
	if (other)
		unix_state_unlock(other);

out:
	/* Common cleanup; skb, newsk and other may each still be NULL here. */
	kfree_skb(skb);
	if (newsk)
		unix_release_sock(newsk, 0);
	if (other)
		sock_put(other);
	return err;
}
1226
1227static int unix_socketpair(struct socket *socka, struct socket *sockb)
1228{
1229	struct sock *ska = socka->sk, *skb = sockb->sk;
1230
1231	/* Join our sockets back to back */
1232	sock_hold(ska);
1233	sock_hold(skb);
1234	unix_peer(ska) = skb;
1235	unix_peer(skb) = ska;
1236	init_peercred(ska);
1237	init_peercred(skb);
1238
1239	if (ska->sk_type != SOCK_DGRAM) {
1240		ska->sk_state = TCP_ESTABLISHED;
1241		skb->sk_state = TCP_ESTABLISHED;
1242		socka->state  = SS_CONNECTED;
1243		sockb->state  = SS_CONNECTED;
1244	}
1245	return 0;
1246}
1247
1248static int unix_accept(struct socket *sock, struct socket *newsock, int flags)
1249{
1250	struct sock *sk = sock->sk;
1251	struct sock *tsk;
1252	struct sk_buff *skb;
1253	int err;
1254
1255	err = -EOPNOTSUPP;
1256	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1257		goto out;
1258
1259	err = -EINVAL;
1260	if (sk->sk_state != TCP_LISTEN)
1261		goto out;
1262
1263	/* If socket state is TCP_LISTEN it cannot change (for now...),
1264	 * so that no locks are necessary.
1265	 */
1266
1267	skb = skb_recv_datagram(sk, 0, flags&O_NONBLOCK, &err);
1268	if (!skb) {
1269		/* This means receive shutdown. */
1270		if (err == 0)
1271			err = -EINVAL;
1272		goto out;
1273	}
1274
1275	tsk = skb->sk;
1276	skb_free_datagram(sk, skb);
1277	wake_up_interruptible(&unix_sk(sk)->peer_wait);
1278
1279	/* attach accepted sock to socket */
1280	unix_state_lock(tsk);
1281	newsock->state = SS_CONNECTED;
1282	sock_graft(tsk, newsock);
1283	unix_state_unlock(tsk);
1284	return 0;
1285
1286out:
1287	return err;
1288}
1289
1290
1291static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_len, int peer)
1292{
1293	struct sock *sk = sock->sk;
1294	struct unix_sock *u;
1295	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1296	int err = 0;
1297
1298	if (peer) {
1299		sk = unix_peer_get(sk);
1300
1301		err = -ENOTCONN;
1302		if (!sk)
1303			goto out;
1304		err = 0;
1305	} else {
1306		sock_hold(sk);
1307	}
1308
1309	u = unix_sk(sk);
1310	unix_state_lock(sk);
1311	if (!u->addr) {
1312		sunaddr->sun_family = AF_UNIX;
1313		sunaddr->sun_path[0] = 0;
1314		*uaddr_len = sizeof(short);
1315	} else {
1316		struct unix_address *addr = u->addr;
1317
1318		*uaddr_len = addr->len;
1319		memcpy(sunaddr, addr->name, *uaddr_len);
1320	}
1321	unix_state_unlock(sk);
1322	sock_put(sk);
1323out:
1324	return err;
1325}
1326
1327static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1328{
1329	int i;
1330
1331	scm->fp = UNIXCB(skb).fp;
1332	UNIXCB(skb).fp = NULL;
1333
1334	for (i = scm->fp->count-1; i >= 0; i--)
1335		unix_notinflight(scm->fp->fp[i]);
1336}
1337
1338static void unix_destruct_scm(struct sk_buff *skb)
1339{
1340	struct scm_cookie scm;
1341	memset(&scm, 0, sizeof(scm));
1342	scm.pid  = UNIXCB(skb).pid;
1343	scm.cred = UNIXCB(skb).cred;
1344	if (UNIXCB(skb).fp)
1345		unix_detach_fds(&scm, skb);
1346
1347	/* Alas, it calls VFS */
1348	/* So fscking what? fput() had been SMP-safe since the last Summer */
1349	scm_destroy(&scm);
1350	sock_wfree(skb);
1351}
1352
1353#define MAX_RECURSION_LEVEL 4
1354
1355static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1356{
1357	int i;
1358	unsigned char max_level = 0;
1359	int unix_sock_count = 0;
1360
1361	for (i = scm->fp->count - 1; i >= 0; i--) {
1362		struct sock *sk = unix_get_socket(scm->fp->fp[i]);
1363
1364		if (sk) {
1365			unix_sock_count++;
1366			max_level = max(max_level,
1367					unix_sk(sk)->recursion_level);
1368		}
1369	}
1370	if (unlikely(max_level > MAX_RECURSION_LEVEL))
1371		return -ETOOMANYREFS;
1372
1373	/*
1374	 * Need to duplicate file references for the sake of garbage
1375	 * collection.  Otherwise a socket in the fps might become a
1376	 * candidate for GC while the skb is not yet queued.
1377	 */
1378	UNIXCB(skb).fp = scm_fp_dup(scm->fp);
1379	if (!UNIXCB(skb).fp)
1380		return -ENOMEM;
1381
1382	if (unix_sock_count) {
1383		for (i = scm->fp->count - 1; i >= 0; i--)
1384			unix_inflight(scm->fp->fp[i]);
1385	}
1386	return max_level;
1387}
1388
1389static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1390{
1391	int err = 0;
1392
1393	UNIXCB(skb).pid  = get_pid(scm->pid);
1394	if (scm->cred)
1395		UNIXCB(skb).cred = get_cred(scm->cred);
1396	UNIXCB(skb).fp = NULL;
1397	if (scm->fp && send_fds)
1398		err = unix_attach_fds(scm, skb);
1399
1400	skb->destructor = unix_destruct_scm;
1401	return err;
1402}
1403
1404/*
1405 * Some apps rely on write() giving SCM_CREDENTIALS
1406 * We include credentials if source or destination socket
1407 * asserted SOCK_PASSCRED.
1408 */
1409static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1410			    const struct sock *other)
1411{
1412	if (UNIXCB(skb).cred)
1413		return;
1414	if (test_bit(SOCK_PASSCRED, &sock->flags) ||
1415	    !other->sk_socket ||
1416	    test_bit(SOCK_PASSCRED, &other->sk_socket->flags)) {
1417		UNIXCB(skb).pid  = get_pid(task_tgid(current));
1418		UNIXCB(skb).cred = get_current_cred();
1419	}
1420}
1421
1422/*
1423 *	Send AF_UNIX data.
1424 */
1425
/*
 * Send one datagram (also used for SOCK_SEQPACKET via
 * unix_seqpacket_sendmsg()).
 *
 * The destination is either the address in msg->msg_name or, when no name
 * is given, the connected peer.  The whole message is copied into a single
 * skb which is then queued on the receiver.
 *
 * Returns @len on success (also when a socket filter silently drops the
 * packet) or a negative errno.
 */
static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock,
			      struct msghdr *msg, size_t len)
{
	struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
	struct sock *sk = sock->sk;
	struct net *net = sock_net(sk);
	struct unix_sock *u = unix_sk(sk);
	struct sockaddr_un *sunaddr = msg->msg_name;
	struct sock *other = NULL;
	int namelen = 0; /* fake GCC */
	int err;
	unsigned int hash;
	struct sk_buff *skb;
	long timeo;
	struct scm_cookie tmp_scm;
	int max_level;
	int data_len = 0;

	/* Use an on-stack cookie when the caller did not supply one. */
	if (NULL == siocb->scm)
		siocb->scm = &tmp_scm;
	wait_for_unix_gc();
	err = scm_send(sock, msg, siocb->scm, false);
	if (err < 0)
		return err;

	err = -EOPNOTSUPP;
	if (msg->msg_flags&MSG_OOB)
		goto out;

	if (msg->msg_namelen) {
		/* Explicit destination: it is looked up after the skb is built. */
		err = unix_mkname(sunaddr, msg->msg_namelen, &hash);
		if (err < 0)
			goto out;
		namelen = err;
	} else {
		/* No destination given: send to the connected peer. */
		sunaddr = NULL;
		err = -ENOTCONN;
		other = unix_peer_get(sk);
		if (!other)
			goto out;
	}

	/* A SOCK_PASSCRED socket without an address gets one via autobind. */
	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr
	    && (err = unix_autobind(sock)) != 0)
		goto out;

	err = -EMSGSIZE;
	if (len > sk->sk_sndbuf - 32)
		goto out;

	/* Put the part beyond SKB_MAX_ALLOC into page fragments. */
	if (len > SKB_MAX_ALLOC)
		data_len = min_t(size_t,
				 len - SKB_MAX_ALLOC,
				 MAX_SKB_FRAGS * PAGE_SIZE);

	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
				   msg->msg_flags & MSG_DONTWAIT, &err);
	if (skb == NULL)
		goto out;

	err = unix_scm_to_skb(siocb->scm, skb, true);
	if (err < 0)
		goto out_free;
	/* Receiver ends up one level deeper than the deepest socket passed. */
	max_level = err + 1;
	unix_get_secdata(siocb->scm, skb);

	skb_put(skb, len - data_len);
	skb->data_len = data_len;
	skb->len = len;
	err = skb_copy_datagram_from_iovec(skb, 0, msg->msg_iov, 0, len);
	if (err)
		goto out_free;

	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);

restart:
	if (!other) {
		/* No name and no peer left means the peer went away earlier. */
		err = -ECONNRESET;
		if (sunaddr == NULL)
			goto out_free;

		other = unix_find_other(net, sunaddr, namelen, sk->sk_type,
					hash, &err);
		if (other == NULL)
			goto out_free;
	}

	if (sk_filter(other, skb) < 0) {
		/* Toss the packet but do not return any error to the sender */
		err = len;
		goto out_free;
	}

	unix_state_lock(other);
	err = -EPERM;
	if (!unix_may_send(sk, other))
		goto out_unlock;

	if (sock_flag(other, SOCK_DEAD)) {
		/*
		 *	Check with 1003.1g - what should
		 *	datagram error
		 */
		unix_state_unlock(other);
		/* Drop the reference held for this send attempt. */
		sock_put(other);

		err = 0;
		unix_state_lock(sk);
		if (unix_peer(sk) == other) {
			/* The dead socket was our connected peer: disconnect. */
			unix_peer(sk) = NULL;
			unix_state_unlock(sk);

			unix_dgram_disconnected(sk, other);
			/* Presumably the reference the peer link was holding. */
			sock_put(other);
			err = -ECONNREFUSED;
		} else {
			unix_state_unlock(sk);
		}

		/* With an explicit address, look the destination up again. */
		other = NULL;
		if (err)
			goto out_free;
		goto restart;
	}

	err = -EPIPE;
	if (other->sk_shutdown & RCV_SHUTDOWN)
		goto out_unlock;

	if (sk->sk_type != SOCK_SEQPACKET) {
		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
		if (err)
			goto out_unlock;
	}

	/* The queue limit is not applied when the receiver is connected to us. */
	if (unix_peer(other) != sk && unix_recvq_full(other)) {
		if (!timeo) {
			err = -EAGAIN;
			goto out_unlock;
		}

		/*
		 * NOTE(review): both exits below skip out_unlock, so
		 * unix_wait_for_peer() is presumably expected to return with
		 * other's state lock released - confirm against its
		 * definition.
		 */
		timeo = unix_wait_for_peer(other, timeo);

		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			goto out_free;

		goto restart;
	}

	if (sock_flag(other, SOCK_RCVTSTAMP))
		__net_timestamp(skb);
	maybe_add_creds(skb, sock, other);
	skb_queue_tail(&other->sk_receive_queue, skb);
	if (max_level > unix_sk(other)->recursion_level)
		unix_sk(other)->recursion_level = max_level;
	unix_state_unlock(other);
	other->sk_data_ready(other, len);
	sock_put(other);
	scm_destroy(siocb->scm);
	return len;

out_unlock:
	unix_state_unlock(other);
out_free:
	kfree_skb(skb);
out:
	if (other)
		sock_put(other);
	scm_destroy(siocb->scm);
	return err;
}
1598
1599
/*
 * Send data on a connected stream socket.
 *
 * The message is split into skbs of at most half the send buffer (minus 64
 * bytes) and at most SKB_MAX_ALLOC each; every skb is queued directly on
 * the peer's receive queue.  Passed file descriptors travel only with the
 * first skb.
 *
 * Returns the number of bytes queued if any were, otherwise a negative
 * errno.  A broken pipe raises SIGPIPE unless MSG_NOSIGNAL is set or some
 * data was already sent.
 */
static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
			       struct msghdr *msg, size_t len)
{
	struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
	struct sock *sk = sock->sk;
	struct sock *other = NULL;
	int err, size;
	struct sk_buff *skb;
	int sent = 0;
	struct scm_cookie tmp_scm;
	bool fds_sent = false;
	int max_level;

	/* Use an on-stack cookie when the caller did not supply one. */
	if (NULL == siocb->scm)
		siocb->scm = &tmp_scm;
	wait_for_unix_gc();
	err = scm_send(sock, msg, siocb->scm, false);
	if (err < 0)
		return err;

	err = -EOPNOTSUPP;
	if (msg->msg_flags&MSG_OOB)
		goto out_err;

	if (msg->msg_namelen) {
		/* A destination address is never accepted on a stream socket. */
		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
		goto out_err;
	} else {
		err = -ENOTCONN;
		/* Plain read of the peer pointer; no extra reference is taken. */
		other = unix_peer(sk);
		if (!other)
			goto out_err;
	}

	if (sk->sk_shutdown & SEND_SHUTDOWN)
		goto pipe_err;

	while (sent < len) {
		/*
		 *	Optimisation for the fact that under 0.01% of X
		 *	messages typically need breaking up.
		 */

		size = len-sent;

		/* Keep two messages in the pipe so it schedules better */
		if (size > ((sk->sk_sndbuf >> 1) - 64))
			size = (sk->sk_sndbuf >> 1) - 64;

		if (size > SKB_MAX_ALLOC)
			size = SKB_MAX_ALLOC;

		/*
		 *	Grab a buffer
		 */

		skb = sock_alloc_send_skb(sk, size, msg->msg_flags&MSG_DONTWAIT,
					  &err);

		if (skb == NULL)
			goto out_err;

		/*
		 *	If you pass two values to the sock_alloc_send_skb
		 *	it tries to grab the large buffer with GFP_NOFS
		 *	(which can fail easily), and if it fails grab the
		 *	fallback size buffer which is under a page and will
		 *	succeed. [Alan]
		 */
		size = min_t(int, size, skb_tailroom(skb));


		/* Only send the fds in the first buffer */
		err = unix_scm_to_skb(siocb->scm, skb, !fds_sent);
		if (err < 0) {
			kfree_skb(skb);
			goto out_err;
		}
		/* Receiver ends up one level deeper than the deepest socket passed. */
		max_level = err + 1;
		fds_sent = true;

		err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size);
		if (err) {
			kfree_skb(skb);
			goto out_err;
		}

		unix_state_lock(other);

		if (sock_flag(other, SOCK_DEAD) ||
		    (other->sk_shutdown & RCV_SHUTDOWN))
			goto pipe_err_free;

		maybe_add_creds(skb, sock, other);
		skb_queue_tail(&other->sk_receive_queue, skb);
		if (max_level > unix_sk(other)->recursion_level)
			unix_sk(other)->recursion_level = max_level;
		unix_state_unlock(other);
		other->sk_data_ready(other, size);
		sent += size;
	}

	scm_destroy(siocb->scm);
	siocb->scm = NULL;

	return sent;

pipe_err_free:
	unix_state_unlock(other);
	kfree_skb(skb);
pipe_err:
	/* Signal only when nothing at all was sent. */
	if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
		send_sig(SIGPIPE, current, 0);
	err = -EPIPE;
out_err:
	scm_destroy(siocb->scm);
	siocb->scm = NULL;
	/* A partial send reports the byte count rather than the error. */
	return sent ? : err;
}
1719
1720static int unix_seqpacket_sendmsg(struct kiocb *kiocb, struct socket *sock,
1721				  struct msghdr *msg, size_t len)
1722{
1723	int err;
1724	struct sock *sk = sock->sk;
1725
1726	err = sock_error(sk);
1727	if (err)
1728		return err;
1729
1730	if (sk->sk_state != TCP_ESTABLISHED)
1731		return -ENOTCONN;
1732
1733	if (msg->msg_namelen)
1734		msg->msg_namelen = 0;
1735
1736	return unix_dgram_sendmsg(kiocb, sock, msg, len);
1737}
1738
1739static int unix_seqpacket_recvmsg(struct kiocb *iocb, struct socket *sock,
1740			      struct msghdr *msg, size_t size,
1741			      int flags)
1742{
1743	struct sock *sk = sock->sk;
1744
1745	if (sk->sk_state != TCP_ESTABLISHED)
1746		return -ENOTCONN;
1747
1748	return unix_dgram_recvmsg(iocb, sock, msg, size, flags);
1749}
1750
1751static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
1752{
1753	struct unix_sock *u = unix_sk(sk);
1754
1755	msg->msg_namelen = 0;
1756	if (u->addr) {
1757		msg->msg_namelen = u->addr->len;
1758		memcpy(msg->msg_name, u->addr->name, u->addr->len);
1759	}
1760}
1761
/*
 * Receive one datagram (also used for SOCK_SEQPACKET via
 * unix_seqpacket_recvmsg()).
 *
 * Copies at most @size bytes of the next queued skb, starting at the
 * current peek offset, into msg->msg_iov.  Sender address, timestamp,
 * credentials, security data and passed files are delivered alongside.
 *
 * Returns the number of bytes copied (or the full remaining datagram length
 * when MSG_TRUNC is set in @flags), 0 at EOF on a shut-down non-blocking
 * SEQPACKET socket, or a negative errno.
 */
static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
			      struct msghdr *msg, size_t size,
			      int flags)
{
	struct sock_iocb *siocb = kiocb_to_siocb(iocb);
	struct scm_cookie tmp_scm;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	int noblock = flags & MSG_DONTWAIT;
	struct sk_buff *skb;
	int err;
	int peeked, skip;

	err = -EOPNOTSUPP;
	if (flags&MSG_OOB)
		goto out;

	msg->msg_namelen = 0;

	/* readlock is held for the whole receive and released at out_unlock. */
	err = mutex_lock_interruptible(&u->readlock);
	if (err) {
		err = sock_intr_errno(sock_rcvtimeo(sk, noblock));
		goto out;
	}

	skip = sk_peek_offset(sk, flags);

	skb = __skb_recv_datagram(sk, flags, &peeked, &skip, &err);
	if (!skb) {
		unix_state_lock(sk);
		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
		    (sk->sk_shutdown & RCV_SHUTDOWN))
			err = 0;
		unix_state_unlock(sk);
		goto out_unlock;
	}

	/* A queue slot may have been freed: wake senders waiting on peer_wait. */
	wake_up_interruptible_sync_poll(&u->peer_wait,
					POLLOUT | POLLWRNORM | POLLWRBAND);

	if (msg->msg_name)
		unix_copy_addr(msg, skb->sk);

	/* Clamp to what is left of the datagram; flag truncation otherwise. */
	if (size > skb->len - skip)
		size = skb->len - skip;
	else if (size < skb->len - skip)
		msg->msg_flags |= MSG_TRUNC;

	err = skb_copy_datagram_iovec(skb, skip, msg->msg_iov, size);
	if (err)
		goto out_free;

	if (sock_flag(sk, SOCK_RCVTSTAMP))
		__sock_recv_timestamp(msg, sk, skb);

	/* Use a zeroed on-stack cookie when the caller did not supply one. */
	if (!siocb->scm) {
		siocb->scm = &tmp_scm;
		memset(&tmp_scm, 0, sizeof(tmp_scm));
	}
	scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).cred);
	unix_set_secdata(siocb->scm, skb);

	if (!(flags & MSG_PEEK)) {
		/* Real read: the passed files move from the skb to the cookie. */
		if (UNIXCB(skb).fp)
			unix_detach_fds(siocb->scm, skb);

		sk_peek_offset_bwd(sk, skb->len);
	} else {
		/* It is questionable: on PEEK we could:
		   - do not return fds - good, but too simple 8)
		   - return fds, and do not return them on read (old strategy,
		     apparently wrong)
		   - clone fds (I chose it for now, it is the most universal
		     solution)

		   POSIX 1003.1g does not actually define this clearly
		   at all. POSIX 1003.1g doesn't define a lot of things
		   clearly however!

		*/

		sk_peek_offset_fwd(sk, size);

		if (UNIXCB(skb).fp)
			siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);
	}
	/* With MSG_TRUNC the caller gets the real remaining length. */
	err = (flags & MSG_TRUNC) ? skb->len - skip : size;

	scm_recv(sock, msg, siocb->scm, flags);

out_free:
	skb_free_datagram(sk, skb);
out_unlock:
	mutex_unlock(&u->readlock);
out:
	return err;
}
1860
1861/*
1862 *	Sleep until data has arrive. But check for races..
1863 */
1864
1865static long unix_stream_data_wait(struct sock *sk, long timeo)
1866{
1867	DEFINE_WAIT(wait);
1868
1869	unix_state_lock(sk);
1870
1871	for (;;) {
1872		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1873
1874		if (!skb_queue_empty(&sk->sk_receive_queue) ||
1875		    sk->sk_err ||
1876		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
1877		    signal_pending(current) ||
1878		    !timeo)
1879			break;
1880
1881		set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1882		unix_state_unlock(sk);
1883		timeo = schedule_timeout(timeo);
1884		unix_state_lock(sk);
1885		clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1886	}
1887
1888	finish_wait(sk_sleep(sk), &wait);
1889	unix_state_unlock(sk);
1890	return timeo;
1891}
1892
1893
1894
1895static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
1896			       struct msghdr *msg, size_t size,
1897			       int flags)
1898{
1899	struct sock_iocb *siocb = kiocb_to_siocb(iocb);
1900	struct scm_cookie tmp_scm;
1901	struct sock *sk = sock->sk;
1902	struct unix_sock *u = unix_sk(sk);
1903	struct sockaddr_un *sunaddr = msg->msg_name;
1904	int copied = 0;
1905	int check_creds = 0;
1906	int target;
1907	int err = 0;
1908	long timeo;
1909	int skip;
1910
1911	err = -EINVAL;
1912	if (sk->sk_state != TCP_ESTABLISHED)
1913		goto out;
1914
1915	err = -EOPNOTSUPP;
1916	if (flags&MSG_OOB)
1917		goto out;
1918
1919	target = sock_rcvlowat(sk, flags&MSG_WAITALL, size);
1920	timeo = sock_rcvtimeo(sk, flags&MSG_DONTWAIT);
1921
1922	msg->msg_namelen = 0;
1923
1924	/* Lock the socket to prevent queue disordering
1925	 * while sleeps in memcpy_tomsg
1926	 */
1927
1928	if (!siocb->scm) {
1929		siocb->scm = &tmp_scm;
1930		memset(&tmp_scm, 0, sizeof(tmp_scm));
1931	}
1932
1933	err = mutex_lock_interruptible(&u->readlock);
1934	if (err) {
1935		err = sock_intr_errno(timeo);
1936		goto out;
1937	}
1938
1939	skip = sk_peek_offset(sk, flags);
1940
1941	do {
1942		int chunk;
1943		struct sk_buff *skb;
1944
1945		unix_state_lock(sk);
1946		skb = skb_peek(&sk->sk_receive_queue);
1947again:
1948		if (skb == NULL) {
1949			unix_sk(sk)->recursion_level = 0;
1950			if (copied >= target)
1951				goto unlock;
1952
1953			/*
1954			 *	POSIX 1003.1g mandates this order.
1955			 */
1956
1957			err = sock_error(sk);
1958			if (err)
1959				goto unlock;
1960			if (sk->sk_shutdown & RCV_SHUTDOWN)
1961				goto unlock;
1962
1963			unix_state_unlock(sk);
1964			err = -EAGAIN;
1965			if (!timeo)
1966				break;
1967			mutex_unlock(&u->readlock);
1968
1969			timeo = unix_stream_data_wait(sk, timeo);
1970
1971			if (signal_pending(current)
1972			    ||  mutex_lock_interruptible(&u->readlock)) {
1973				err = sock_intr_errno(timeo);
1974				goto out;
1975			}
1976
1977			continue;
1978 unlock:
1979			unix_state_unlock(sk);
1980			break;
1981		}
1982
1983		if (skip >= skb->len) {
1984			skip -= skb->len;
1985			skb = skb_peek_next(skb, &sk->sk_receive_queue);
1986			goto again;
1987		}
1988
1989		unix_state_unlock(sk);
1990
1991		if (check_creds) {
1992			/* Never glue messages from different writers */
1993			if ((UNIXCB(skb).pid  != siocb->scm->pid) ||
1994			    (UNIXCB(skb).cred != siocb->scm->cred))
1995				break;
1996		} else {
1997			/* Copy credentials */
1998			scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).cred);
1999			check_creds = 1;
2000		}
2001
2002		/* Copy address just once */
2003		if (sunaddr) {
2004			unix_copy_addr(msg, skb->sk);
2005			sunaddr = NULL;
2006		}
2007
2008		chunk = min_t(unsigned int, skb->len - skip, size);
2009		if (memcpy_toiovec(msg->msg_iov, skb->data + skip, chunk)) {
2010			if (copied == 0)
2011				copied = -EFAULT;
2012			break;
2013		}
2014		copied += chunk;
2015		size -= chunk;
2016
2017		/* Mark read part of skb as used */
2018		if (!(flags & MSG_PEEK)) {
2019			skb_pull(skb, chunk);
2020
2021			sk_peek_offset_bwd(sk, chunk);
2022
2023			if (UNIXCB(skb).fp)
2024				unix_detach_fds(siocb->scm, skb);
2025
2026			if (skb->len)
2027				break;
2028
2029			skb_unlink(skb, &sk->sk_receive_queue);
2030			consume_skb(skb);
2031
2032			if (siocb->scm->fp)
2033				break;
2034		} else {
2035			/* It is questionable, see note in unix_dgram_recvmsg.
2036			 */
2037			if (UNIXCB(skb).fp)
2038				siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);
2039
2040			sk_peek_offset_fwd(sk, chunk);
2041
2042			break;
2043		}
2044	} while (size);
2045
2046	mutex_unlock(&u->readlock);
2047	scm_recv(sock, msg, siocb->scm, flags);
2048out:
2049	return copied ? : err;
2050}
2051
2052static int unix_shutdown(struct socket *sock, int mode)
2053{
2054	struct sock *sk = sock->sk;
2055	struct sock *other;
2056
2057	if (mode < SHUT_RD || mode > SHUT_RDWR)
2058		return -EINVAL;
2059	/* This maps:
2060	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
2061	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
2062	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
2063	 */
2064	++mode;
2065
2066	unix_state_lock(sk);
2067	sk->sk_shutdown |= mode;
2068	other = unix_peer(sk);
2069	if (other)
2070		sock_hold(other);
2071	unix_state_unlock(sk);
2072	sk->sk_state_change(sk);
2073
2074	if (other &&
2075		(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
2076
2077		int peer_mode = 0;
2078
2079		if (mode&RCV_SHUTDOWN)
2080			peer_mode |= SEND_SHUTDOWN;
2081		if (mode&SEND_SHUTDOWN)
2082			peer_mode |= RCV_SHUTDOWN;
2083		unix_state_lock(other);
2084		other->sk_shutdown |= peer_mode;
2085		unix_state_unlock(other);
2086		other->sk_state_change(other);
2087		if (peer_mode == SHUTDOWN_MASK)
2088			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
2089		else if (peer_mode & RCV_SHUTDOWN)
2090			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
2091	}
2092	if (other)
2093		sock_put(other);
2094
2095	return 0;
2096}
2097
2098long unix_inq_len(struct sock *sk)
2099{
2100	struct sk_buff *skb;
2101	long amount = 0;
2102
2103	if (sk->sk_state == TCP_LISTEN)
2104		return -EINVAL;
2105
2106	spin_lock(&sk->sk_receive_queue.lock);
2107	if (sk->sk_type == SOCK_STREAM ||
2108	    sk->sk_type == SOCK_SEQPACKET) {
2109		skb_queue_walk(&sk->sk_receive_queue, skb)
2110			amount += skb->len;
2111	} else {
2112		skb = skb_peek(&sk->sk_receive_queue);
2113		if (skb)
2114			amount = skb->len;
2115	}
2116	spin_unlock(&sk->sk_receive_queue.lock);
2117
2118	return amount;
2119}
2120EXPORT_SYMBOL_GPL(unix_inq_len);
2121
/* Amount of send-buffer memory in use, as reported by sk_wmem_alloc_get(). */
long unix_outq_len(struct sock *sk)
{
	long queued = sk_wmem_alloc_get(sk);

	return queued;
}
EXPORT_SYMBOL_GPL(unix_outq_len);
2127
2128static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2129{
2130	struct sock *sk = sock->sk;
2131	long amount = 0;
2132	int err;
2133
2134	switch (cmd) {
2135	case SIOCOUTQ:
2136		amount = unix_outq_len(sk);
2137		err = put_user(amount, (int __user *)arg);
2138		break;
2139	case SIOCINQ:
2140		amount = unix_inq_len(sk);
2141		if (amount < 0)
2142			err = amount;
2143		else
2144			err = put_user(amount, (int __user *)arg);
2145		break;
2146	default:
2147		err = -ENOIOCTLCMD;
2148		break;
2149	}
2150	return err;
2151}
2152
2153static unsigned int unix_poll(struct file *file, struct socket *sock, poll_table *wait)
2154{
2155	struct sock *sk = sock->sk;
2156	unsigned int mask;
2157
2158	sock_poll_wait(file, sk_sleep(sk), wait);
2159	mask = 0;
2160
2161	/* exceptional events? */
2162	if (sk->sk_err)
2163		mask |= POLLERR;
2164	if (sk->sk_shutdown == SHUTDOWN_MASK)
2165		mask |= POLLHUP;
2166	if (sk->sk_shutdown & RCV_SHUTDOWN)
2167		mask |= POLLRDHUP | POLLIN | POLLRDNORM;
2168
2169	/* readable? */
2170	if (!skb_queue_empty(&sk->sk_receive_queue))
2171		mask |= POLLIN | POLLRDNORM;
2172
2173	/* Connection-based need to check for termination and startup */
2174	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
2175	    sk->sk_state == TCP_CLOSE)
2176		mask |= POLLHUP;
2177
2178	/*
2179	 * we set writable also when the other side has shut down the
2180	 * connection. This prevents stuck sockets.
2181	 */
2182	if (unix_writable(sk))
2183		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
2184
2185	return mask;
2186}
2187
2188static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
2189				    poll_table *wait)
2190{
2191	struct sock *sk = sock->sk, *other;
2192	unsigned int mask, writable;
2193
2194	sock_poll_wait(file, sk_sleep(sk), wait);
2195	mask = 0;
2196
2197	/* exceptional events? */
2198	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
2199		mask |= POLLERR;
2200	if (sk->sk_shutdown & RCV_SHUTDOWN)
2201		mask |= POLLRDHUP | POLLIN | POLLRDNORM;
2202	if (sk->sk_shutdown == SHUTDOWN_MASK)
2203		mask |= POLLHUP;
2204
2205	/* readable? */
2206	if (!skb_queue_empty(&sk->sk_receive_queue))
2207		mask |= POLLIN | POLLRDNORM;
2208
2209	/* Connection-based need to check for termination and startup */
2210	if (sk->sk_type == SOCK_SEQPACKET) {
2211		if (sk->sk_state == TCP_CLOSE)
2212			mask |= POLLHUP;
2213		/* connection hasn't started yet? */
2214		if (sk->sk_state == TCP_SYN_SENT)
2215			return mask;
2216	}
2217
2218	/* No write status requested, avoid expensive OUT tests. */
2219	if (!(poll_requested_events(wait) & (POLLWRBAND|POLLWRNORM|POLLOUT)))
2220		return mask;
2221
2222	writable = unix_writable(sk);
2223	other = unix_peer_get(sk);
2224	if (other) {
2225		if (unix_peer(other) != sk) {
2226			sock_poll_wait(file, &unix_sk(other)->peer_wait, wait);
2227			if (unix_recvq_full(other))
2228				writable = 0;
2229		}
2230		sock_put(other);
2231	}
2232
2233	if (writable)
2234		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
2235	else
2236		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
2237
2238	return mask;
2239}
2240
2241#ifdef CONFIG_PROC_FS
2242
/*
 * The seq_file position packs two values into one number: the hash-table
 * bucket index in the bits above BUCKET_SPACE and, in the low BUCKET_SPACE
 * bits, the 1-based offset of the socket within that bucket.
 */
#define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)

/* Extract the bucket index / in-bucket offset, or combine the two. */
#define get_bucket(x) ((x) >> BUCKET_SPACE)
#define get_offset(x) ((x) & ((1L << BUCKET_SPACE) - 1))
#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
2248
2249static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
2250{
2251	unsigned long offset = get_offset(*pos);
2252	unsigned long bucket = get_bucket(*pos);
2253	struct sock *sk;
2254	unsigned long count = 0;
2255
2256	for (sk = sk_head(&unix_socket_table[bucket]); sk; sk = sk_next(sk)) {
2257		if (sock_net(sk) != seq_file_net(seq))
2258			continue;
2259		if (++count == offset)
2260			break;
2261	}
2262
2263	return sk;
2264}
2265
/*
 * Find the next socket to show.
 *
 * If @sk is a real socket (neither NULL nor SEQ_START_TOKEN), continue along
 * its bucket chain to the next socket in the seq_file's namespace.
 * Otherwise, or once the chain is exhausted, resolve *@pos via
 * unix_from_bucket(), moving *@pos to offset 1 of each following bucket
 * until a socket is found or the table ends.
 *
 * Returns the socket, or NULL when there are no more.
 */
static struct sock *unix_next_socket(struct seq_file *seq,
				     struct sock *sk,
				     loff_t *pos)
{
	unsigned long bucket;

	/* Only entered for a real socket pointer; skipped for NULL/start token. */
	while (sk > (struct sock *)SEQ_START_TOKEN) {
		sk = sk_next(sk);
		if (!sk)
			goto next_bucket;
		if (sock_net(sk) == seq_file_net(seq))
			return sk;
	}

	do {
		sk = unix_from_bucket(seq, pos);
		if (sk)
			return sk;

		/* Also the entry point when a bucket chain ran out above. */
next_bucket:
		bucket = get_bucket(*pos) + 1;
		*pos = set_bucket_offset(bucket, 1);
	} while (bucket < ARRAY_SIZE(unix_socket_table));

	return NULL;
}
2292
2293static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
2294	__acquires(unix_table_lock)
2295{
2296	spin_lock(&unix_table_lock);
2297
2298	if (!*pos)
2299		return SEQ_START_TOKEN;
2300
2301	if (get_bucket(*pos) >= ARRAY_SIZE(unix_socket_table))
2302		return NULL;
2303
2304	return unix_next_socket(seq, NULL, pos);
2305}
2306
2307static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2308{
2309	++*pos;
2310	return unix_next_socket(seq, v, pos);
2311}
2312
/* seq_file ->stop: drop the table lock taken in unix_seq_start(). */
static void unix_seq_stop(struct seq_file *seq, void *v)
	__releases(unix_table_lock)
{
	spin_unlock(&unix_table_lock);
}
2318
2319static int unix_seq_show(struct seq_file *seq, void *v)
2320{
2321
2322	if (v == SEQ_START_TOKEN)
2323		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
2324			 "Inode Path\n");
2325	else {
2326		struct sock *s = v;
2327		struct unix_sock *u = unix_sk(s);
2328		unix_state_lock(s);
2329
2330		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
2331			s,
2332			atomic_read(&s->sk_refcnt),
2333			0,
2334			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
2335			s->sk_type,
2336			s->sk_socket ?
2337			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
2338			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
2339			sock_i_ino(s));
2340
2341		if (u->addr) {
2342			int i, len;
2343			seq_putc(seq, ' ');
2344
2345			i = 0;
2346			len = u->addr->len - sizeof(short);
2347			if (!UNIX_ABSTRACT(s))
2348				len--;
2349			else {
2350				seq_putc(seq, '@');
2351				i++;
2352			}
2353			for ( ; i < len; i++)
2354				seq_putc(seq, u->addr->name->sun_path[i]);
2355		}
2356		unix_state_unlock(s);
2357		seq_putc(seq, '\n');
2358	}
2359
2360	return 0;
2361}
2362
/* Iterator callbacks used by unix_seq_open() for the "unix" proc file. */
static const struct seq_operations unix_seq_ops = {
	.start  = unix_seq_start,
	.next   = unix_seq_next,
	.stop   = unix_seq_stop,
	.show   = unix_seq_show,
};
2369
2370static int unix_seq_open(struct inode *inode, struct file *file)
2371{
2372	return seq_open_net(inode, file, &unix_seq_ops,
2373			    sizeof(struct seq_net_private));
2374}
2375
2376static const struct file_operations unix_seq_fops = {
2377	.owner		= THIS_MODULE,
2378	.open		= unix_seq_open,
2379	.read		= seq_read,
2380	.llseek		= seq_lseek,
2381	.release	= seq_release_net,
2382};
2383
2384#endif
2385
2386static const struct net_proto_family unix_family_ops = {
2387	.family = PF_UNIX,
2388	.create = unix_create,
2389	.owner	= THIS_MODULE,
2390};
2391
2392
2393static int __net_init unix_net_init(struct net *net)
2394{
2395	int error = -ENOMEM;
2396
2397	net->unx.sysctl_max_dgram_qlen = 10;
2398	if (unix_sysctl_register(net))
2399		goto out;
2400
2401#ifdef CONFIG_PROC_FS
2402	if (!proc_create("unix", 0, net->proc_net, &unix_seq_fops)) {
2403		unix_sysctl_unregister(net);
2404		goto out;
2405	}
2406#endif
2407	error = 0;
2408out:
2409	return error;
2410}
2411
2412static void __net_exit unix_net_exit(struct net *net)
2413{
2414	unix_sysctl_unregister(net);
2415	remove_proc_entry("unix", net->proc_net);
2416}
2417
2418static struct pernet_operations unix_net_ops = {
2419	.init = unix_net_init,
2420	.exit = unix_net_exit,
2421};
2422
2423static int __init af_unix_init(void)
2424{
2425	int rc = -1;
2426
2427	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > FIELD_SIZEOF(struct sk_buff, cb));
2428
2429	rc = proto_register(&unix_proto, 1);
2430	if (rc != 0) {
2431		printk(KERN_CRIT "%s: Cannot create unix_sock SLAB cache!\n",
2432		       __func__);
2433		goto out;
2434	}
2435
2436	sock_register(&unix_family_ops);
2437	register_pernet_subsys(&unix_net_ops);
2438out:
2439	return rc;
2440}
2441
2442static void __exit af_unix_exit(void)
2443{
2444	sock_unregister(PF_UNIX);
2445	proto_unregister(&unix_proto);
2446	unregister_pernet_subsys(&unix_net_ops);
2447}
2448
/*
 * Earlier than device_initcall() so that other drivers invoking
 * request_module() don't end up in a loop when modprobe tries
 * to use a UNIX socket. But later than subsys_initcall() because
 * we depend on stuff initialised there.
 */
fs_initcall(af_unix_init);
module_exit(af_unix_exit);

MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_UNIX);
2458