af_unix.c revision 877ce7c1b3afd69a9b1caeb1b9964c992641f52a
1/*
2 * NET4:	Implementation of BSD Unix domain sockets.
3 *
4 * Authors:	Alan Cox, <alan.cox@linux.org>
5 *
6 *		This program is free software; you can redistribute it and/or
7 *		modify it under the terms of the GNU General Public License
8 *		as published by the Free Software Foundation; either version
9 *		2 of the License, or (at your option) any later version.
10 *
11 * Version:	$Id: af_unix.c,v 1.133 2002/02/08 03:57:19 davem Exp $
12 *
13 * Fixes:
14 *		Linus Torvalds	:	Assorted bug cures.
15 *		Niibe Yutaka	:	async I/O support.
16 *		Carsten Paeth	:	PF_UNIX check, address fixes.
17 *		Alan Cox	:	Limit size of allocated blocks.
18 *		Alan Cox	:	Fixed the stupid socketpair bug.
19 *		Alan Cox	:	BSD compatibility fine tuning.
20 *		Alan Cox	:	Fixed a bug in connect when interrupted.
21 *		Alan Cox	:	Sorted out a proper draft version of
22 *					file descriptor passing hacked up from
23 *					Mike Shaver's work.
24 *		Marty Leisner	:	Fixes to fd passing
25 *		Nick Nevin	:	recvmsg bugfix.
26 *		Alan Cox	:	Started proper garbage collector
27 *		Heiko EiBfeldt	:	Missing verify_area check
28 *		Alan Cox	:	Started POSIXisms
29 *		Andreas Schwab	:	Replace inode by dentry for proper
30 *					reference counting
31 *		Kirk Petersen	:	Made this a module
32 *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
33 *					Lots of bug fixes.
34 *	     Alexey Kuznetsov	:	Repaired (I hope) bugs introduced
35 *					by above two patches.
36 *	     Andrea Arcangeli	:	If possible we block in connect(2)
37 *					if the max backlog of the listen socket
38 *					has been reached. This won't break
39 *					old apps and it will avoid huge amount
40 *					of socks hashed (this for unix_gc()
41 *					performances reasons).
42 *					Security fix that limits the max
43 *					number of socks to 2*max_files and
44 *					the number of skb queueable in the
45 *					dgram receiver.
46 *		Artur Skawina   :	Hash function optimizations
47 *	     Alexey Kuznetsov   :	Full scale SMP. Lot of bugs are introduced 8)
48 *	      Malcolm Beattie   :	Set peercred for socketpair
49 *	     Michal Ostrowski   :       Module initialization cleanup.
50 *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
51 *	     				the core infrastructure is doing that
52 *	     				for all net proto families now (2.5.69+)
53 *
54 *
55 * Known differences from reference BSD that was tested:
56 *
57 *	[TO FIX]
58 *	ECONNREFUSED is not returned from one end of a connected() socket to the
59 *		other the moment one end closes.
60 *	fstat() doesn't return st_dev=0, and give the blksize as high water mark
61 *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
62 *	[NOT TO FIX]
63 *	accept() returns a path name even if the connecting socket has closed
64 *		in the meantime (BSD loses the path and gives up).
65 *	accept() returns 0 length path for an unbound connector. BSD returns 16
66 *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
67 *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
68 *	BSD af_unix apparently has connect forgetting to block properly.
69 *		(need to check this with the POSIX spec in detail)
70 *
71 * Differences from 2.0.0-11-... (ANK)
72 *	Bug fixes and improvements.
73 *		- client shutdown killed server socket.
74 *		- removed all useless cli/sti pairs.
75 *
76 *	Semantic changes/extensions.
77 *		- generic control message passing.
78 *		- SCM_CREDENTIALS control message.
79 *		- "Abstract" (not FS based) socket bindings.
80 *		  Abstract names are sequences of bytes (not zero terminated)
81 *		  started by 0, so that this name space does not intersect
82 *		  with BSD names.
83 */
84
85#include <linux/module.h>
86#include <linux/config.h>
87#include <linux/kernel.h>
88#include <linux/signal.h>
89#include <linux/sched.h>
90#include <linux/errno.h>
91#include <linux/string.h>
92#include <linux/stat.h>
93#include <linux/dcache.h>
94#include <linux/namei.h>
95#include <linux/socket.h>
96#include <linux/un.h>
97#include <linux/fcntl.h>
98#include <linux/termios.h>
99#include <linux/sockios.h>
100#include <linux/net.h>
101#include <linux/in.h>
102#include <linux/fs.h>
103#include <linux/slab.h>
104#include <asm/uaccess.h>
105#include <linux/skbuff.h>
106#include <linux/netdevice.h>
107#include <net/sock.h>
108#include <net/tcp_states.h>
109#include <net/af_unix.h>
110#include <linux/proc_fs.h>
111#include <linux/seq_file.h>
112#include <net/scm.h>
113#include <linux/init.h>
114#include <linux/poll.h>
115#include <linux/smp_lock.h>
116#include <linux/rtnetlink.h>
117#include <linux/mount.h>
118#include <net/checksum.h>
119#include <linux/security.h>
120
/* Max number of skbs allowed to queue on a dgram receiver (sysctl tunable). */
int sysctl_unix_max_dgram_qlen = 10;

/*
 * Global hash table of unix sockets.  Slot UNIX_HASH_SIZE (one past the
 * normal hash range) collects sockets that are not yet bound to a name.
 */
struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1];
DEFINE_SPINLOCK(unix_table_lock);	/* protects unix_socket_table */
static atomic_t unix_nr_socks = ATOMIC_INIT(0);	/* count of live AF_UNIX socks */

#define unix_sockets_unbound	(&unix_socket_table[UNIX_HASH_SIZE])

/* True when sk is bound to an abstract (non-filesystem) name; FS-bound
 * sockets store hash == UNIX_HASH_SIZE in their address. */
#define UNIX_ABSTRACT(sk)	(unix_sk(sk)->addr->hash != UNIX_HASH_SIZE)
130
#ifdef CONFIG_SECURITY_NETWORK
/*
 * Ask the LSM for the sending socket's security blob and stash it on the
 * skb; on failure leave a NULL blob so the receiver sees no secdata.
 */
static void unix_get_peersec_dgram(struct sk_buff *skb)
{
	int err;

	err = security_socket_getpeersec_dgram(skb, UNIXSECDATA(skb),
					       UNIXSECLEN(skb));
	if (err)
		*(UNIXSECDATA(skb)) = NULL;
}

/* Copy the skb's stashed security blob into the receive-side scm cookie. */
static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	scm->secdata = *UNIXSECDATA(skb);
	scm->seclen = *UNIXSECLEN(skb);
}
#else
/* Stubs when security networking is compiled out. */
static void unix_get_peersec_dgram(struct sk_buff *skb)
{ }

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }
#endif /* CONFIG_SECURITY_NETWORK */
154
155/*
156 *  SMP locking strategy:
157 *    hash table is protected with spinlock unix_table_lock
158 *    each socket state is protected by separate rwlock.
159 */
160
161static inline unsigned unix_hash_fold(unsigned hash)
162{
163	hash ^= hash>>16;
164	hash ^= hash>>8;
165	return hash&(UNIX_HASH_SIZE-1);
166}
167
168#define unix_peer(sk) (unix_sk(sk)->peer)
169
170static inline int unix_our_peer(struct sock *sk, struct sock *osk)
171{
172	return unix_peer(osk) == sk;
173}
174
175static inline int unix_may_send(struct sock *sk, struct sock *osk)
176{
177	return (unix_peer(osk) == NULL || unix_our_peer(sk, osk));
178}
179
/*
 * Return s's peer with an extra reference, or NULL if unconnected.
 * The state read-lock makes the peer load and sock_hold() atomic with
 * respect to disconnect, so the caller owns the reference returned.
 */
static struct sock *unix_peer_get(struct sock *s)
{
	struct sock *peer;

	unix_state_rlock(s);
	peer = unix_peer(s);
	if (peer)
		sock_hold(peer);
	unix_state_runlock(s);
	return peer;
}
191
192static inline void unix_release_addr(struct unix_address *addr)
193{
194	if (atomic_dec_and_test(&addr->refcnt))
195		kfree(addr);
196}
197
198/*
199 *	Check unix socket name:
200 *		- should be not zero length.
201 *	        - if started by not zero, should be NULL terminated (FS object)
202 *		- if started by zero, it is abstract name.
203 */
204
/*
 * Validate and canonicalise a sockaddr_un.  Returns the effective address
 * length, or -EINVAL.  For abstract names (leading NUL) *hashp is set to
 * the name's table hash; for filesystem names *hashp is left untouched.
 */
static int unix_mkname(struct sockaddr_un * sunaddr, int len, unsigned *hashp)
{
	/* Need at least the family field plus one byte, and must fit. */
	if (len <= sizeof(short) || len > sizeof(*sunaddr))
		return -EINVAL;
	if (!sunaddr || sunaddr->sun_family != AF_UNIX)
		return -EINVAL;
	if (sunaddr->sun_path[0]) {
		/*
		 * This may look like an off by one error but it is a bit more
		 * subtle. 108 is the longest valid AF_UNIX path for a binding.
		 * sun_path[108] doesnt as such exist.  However in kernel space
		 * we are guaranteed that it is a valid memory location in our
		 * kernel address buffer.
		 */
		((char *)sunaddr)[len]=0;
		/* Filesystem name: length runs up to and including the NUL. */
		len = strlen(sunaddr->sun_path)+1+sizeof(short);
		return len;
	}

	/* Abstract name: hash the raw (non-terminated) byte sequence. */
	*hashp = unix_hash_fold(csum_partial((char*)sunaddr, len, 0));
	return len;
}
227
/* Unhash sk from its table bucket; caller holds unix_table_lock. */
static void __unix_remove_socket(struct sock *sk)
{
	sk_del_node_init(sk);
}
232
/* Hash sk onto list; caller holds unix_table_lock and sk must be unhashed. */
static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
{
	BUG_TRAP(sk_unhashed(sk));
	sk_add_node(sk, list);
}
238
/* Locked wrapper around __unix_remove_socket(). */
static inline void unix_remove_socket(struct sock *sk)
{
	spin_lock(&unix_table_lock);
	__unix_remove_socket(sk);
	spin_unlock(&unix_table_lock);
}
245
/* Locked wrapper around __unix_insert_socket(). */
static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
{
	spin_lock(&unix_table_lock);
	__unix_insert_socket(list, sk);
	spin_unlock(&unix_table_lock);
}
252
253static struct sock *__unix_find_socket_byname(struct sockaddr_un *sunname,
254					      int len, int type, unsigned hash)
255{
256	struct sock *s;
257	struct hlist_node *node;
258
259	sk_for_each(s, node, &unix_socket_table[hash ^ type]) {
260		struct unix_sock *u = unix_sk(s);
261
262		if (u->addr->len == len &&
263		    !memcmp(u->addr->name, sunname, len))
264			goto found;
265	}
266	s = NULL;
267found:
268	return s;
269}
270
271static inline struct sock *unix_find_socket_byname(struct sockaddr_un *sunname,
272						   int len, int type,
273						   unsigned hash)
274{
275	struct sock *s;
276
277	spin_lock(&unix_table_lock);
278	s = __unix_find_socket_byname(sunname, len, type, hash);
279	if (s)
280		sock_hold(s);
281	spin_unlock(&unix_table_lock);
282	return s;
283}
284
/*
 * Find the socket bound to filesystem inode i, if any.  FS-bound sockets
 * hash by inode number.  Returns a referenced socket or NULL.
 */
static struct sock *unix_find_socket_byinode(struct inode *i)
{
	struct sock *s;
	struct hlist_node *node;

	spin_lock(&unix_table_lock);
	sk_for_each(s, node,
		    &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
		struct dentry *dentry = unix_sk(s)->dentry;

		if(dentry && dentry->d_inode == i)
		{
			sock_hold(s);
			goto found;
		}
	}
	s = NULL;
found:
	spin_unlock(&unix_table_lock);
	return s;
}
306
307static inline int unix_writable(struct sock *sk)
308{
309	return (atomic_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
310}
311
/*
 * sk->sk_write_space callback: once enough write memory has drained,
 * wake sleeping writers and post POLL_OUT to async waiters.  The
 * callback lock keeps sk_sleep stable while we touch it.
 */
static void unix_write_space(struct sock *sk)
{
	read_lock(&sk->sk_callback_lock);
	if (unix_writable(sk)) {
		if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
			wake_up_interruptible(sk->sk_sleep);
		sk_wake_async(sk, 2, POLL_OUT);
	}
	read_unlock(&sk->sk_callback_lock);
}
322
323/* When dgram socket disconnects (or changes its peer), we clear its receive
324 * queue of packets arrived from previous peer. First, it allows to do
325 * flow control based only on wmem_alloc; second, sk connected to peer
326 * may receive messages only from that peer. */
static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
{
	if (!skb_queue_empty(&sk->sk_receive_queue)) {
		skb_queue_purge(&sk->sk_receive_queue);
		/* senders blocked on our previously-full queue may proceed */
		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);

		/* If one link of bidirectional dgram pipe is disconnected,
		 * we signal error. Messages are lost. Do not make this,
		 * when peer was not connected to us.
		 */
		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
			other->sk_err = ECONNRESET;
			other->sk_error_report(other);
		}
	}
}
343
/*
 * sk->sk_destruct hook, run when the last reference to sk is dropped.
 * Sanity-checks that the socket is fully quiesced, then releases its
 * bound address and decrements the global socket count.
 */
static void unix_sock_destructor(struct sock *sk)
{
	struct unix_sock *u = unix_sk(sk);

	skb_queue_purge(&sk->sk_receive_queue);

	BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc));
	BUG_TRAP(sk_unhashed(sk));
	BUG_TRAP(!sk->sk_socket);
	if (!sock_flag(sk, SOCK_DEAD)) {
		printk("Attempt to release alive unix socket: %p\n", sk);
		return;
	}

	if (u->addr)
		unix_release_addr(u->addr);

	atomic_dec(&unix_nr_socks);
#ifdef UNIX_REFCNT_DEBUG
	printk(KERN_DEBUG "UNIX %p is destroyed, %d are still alive.\n", sk, atomic_read(&unix_nr_socks));
#endif
}
366
/*
 * Core teardown for a unix sock.  Unhashes it, marks it dead/closed,
 * notifies a stream/seqpacket peer (which gets ECONNRESET if our queue
 * still held data, or if we were an unaccepted embryo), flushes queued
 * skbs, and drops the table's reference.  @embrion is nonzero when sk
 * died on a listen queue before being accepted.
 */
static int unix_release_sock (struct sock *sk, int embrion)
{
	struct unix_sock *u = unix_sk(sk);
	struct dentry *dentry;
	struct vfsmount *mnt;
	struct sock *skpair;
	struct sk_buff *skb;
	int state;

	unix_remove_socket(sk);

	/* Clear state */
	unix_state_wlock(sk);
	sock_orphan(sk);
	sk->sk_shutdown = SHUTDOWN_MASK;
	dentry	     = u->dentry;
	u->dentry    = NULL;
	mnt	     = u->mnt;
	u->mnt	     = NULL;
	state = sk->sk_state;
	sk->sk_state = TCP_CLOSE;
	unix_state_wunlock(sk);

	wake_up_interruptible_all(&u->peer_wait);

	skpair=unix_peer(sk);

	if (skpair!=NULL) {
		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
			unix_state_wlock(skpair);
			/* No more writes */
			skpair->sk_shutdown = SHUTDOWN_MASK;
			if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
				skpair->sk_err = ECONNRESET;
			unix_state_wunlock(skpair);
			skpair->sk_state_change(skpair);
			read_lock(&skpair->sk_callback_lock);
			sk_wake_async(skpair,1,POLL_HUP);
			read_unlock(&skpair->sk_callback_lock);
		}
		sock_put(skpair); /* It may now die */
		unix_peer(sk) = NULL;
	}

	/* Try to flush out this socket. Throw out buffers at least */

	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
		/* A listener's queue carries embryonic connections rather
		 * than data; release each embryo before freeing the skb. */
		if (state==TCP_LISTEN)
			unix_release_sock(skb->sk, 1);
		/* passed fds are erased in the kfree_skb hook	      */
		kfree_skb(skb);
	}

	if (dentry) {
		dput(dentry);
		mntput(mnt);
	}

	sock_put(sk);

	/* ---- Socket is dead now and most probably destroyed ---- */

	/*
	 * Fixme: BSD difference: In BSD all sockets connected to use get
	 *	  ECONNRESET and we die on the spot. In Linux we behave
	 *	  like files and pipes do and wait for the last
	 *	  dereference.
	 *
	 * Can't we simply set sock->err?
	 *
	 *	  What the above comment does talk about? --ANK(980817)
	 */

	if (atomic_read(&unix_tot_inflight))
		unix_gc();		/* Garbage collect fds */

	return 0;
}
445
/*
 * listen(2) for stream/seqpacket sockets.  The socket must be bound.
 * Entering TCP_LISTEN records the listener's credentials so connect()
 * can copy them to the accepting peer; raising the backlog wakes
 * connectors currently blocked on a full queue.
 */
static int unix_listen(struct socket *sock, int backlog)
{
	int err;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);

	err = -EOPNOTSUPP;
	if (sock->type!=SOCK_STREAM && sock->type!=SOCK_SEQPACKET)
		goto out;			/* Only stream/seqpacket sockets accept */
	err = -EINVAL;
	if (!u->addr)
		goto out;			/* No listens on an unbound socket */
	unix_state_wlock(sk);
	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
		goto out_unlock;
	if (backlog > sk->sk_max_ack_backlog)
		wake_up_interruptible_all(&u->peer_wait);
	sk->sk_max_ack_backlog	= backlog;
	sk->sk_state		= TCP_LISTEN;
	/* set credentials so connect can copy them */
	sk->sk_peercred.pid	= current->tgid;
	sk->sk_peercred.uid	= current->euid;
	sk->sk_peercred.gid	= current->egid;
	err = 0;

out_unlock:
	unix_state_wunlock(sk);
out:
	return err;
}
476
477static int unix_release(struct socket *);
478static int unix_bind(struct socket *, struct sockaddr *, int);
479static int unix_stream_connect(struct socket *, struct sockaddr *,
480			       int addr_len, int flags);
481static int unix_socketpair(struct socket *, struct socket *);
482static int unix_accept(struct socket *, struct socket *, int);
483static int unix_getname(struct socket *, struct sockaddr *, int *, int);
484static unsigned int unix_poll(struct file *, struct socket *, poll_table *);
485static int unix_ioctl(struct socket *, unsigned int, unsigned long);
486static int unix_shutdown(struct socket *, int);
487static int unix_stream_sendmsg(struct kiocb *, struct socket *,
488			       struct msghdr *, size_t);
489static int unix_stream_recvmsg(struct kiocb *, struct socket *,
490			       struct msghdr *, size_t, int);
491static int unix_dgram_sendmsg(struct kiocb *, struct socket *,
492			      struct msghdr *, size_t);
493static int unix_dgram_recvmsg(struct kiocb *, struct socket *,
494			      struct msghdr *, size_t, int);
495static int unix_dgram_connect(struct socket *, struct sockaddr *,
496			      int, int);
497static int unix_seqpacket_sendmsg(struct kiocb *, struct socket *,
498				  struct msghdr *, size_t);
499
/* proto_ops for SOCK_STREAM: connection-oriented, custom poll. */
static const struct proto_ops unix_stream_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_poll,
	.ioctl =	unix_ioctl,
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	unix_stream_sendmsg,
	.recvmsg =	unix_stream_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
};
520
/* proto_ops for SOCK_DGRAM: no accept/listen, generic datagram_poll. */
static const struct proto_ops unix_dgram_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_dgram_connect,
	.socketpair =	unix_socketpair,
	.accept =	sock_no_accept,
	.getname =	unix_getname,
	.poll =		datagram_poll,
	.ioctl =	unix_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	unix_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	unix_dgram_sendmsg,
	.recvmsg =	unix_dgram_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
};
541
/* proto_ops for SOCK_SEQPACKET: stream-style connect/accept with
 * datagram-style send/recv and poll. */
static const struct proto_ops unix_seqpacket_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		datagram_poll,
	.ioctl =	unix_ioctl,
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	unix_seqpacket_sendmsg,
	.recvmsg =	unix_dgram_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
};
562
/* Protocol descriptor: sk_alloc() sizes sockets as struct unix_sock. */
static struct proto unix_proto = {
	.name	  = "UNIX",
	.owner	  = THIS_MODULE,
	.obj_size = sizeof(struct unix_sock),
};
568
/*
 * Allocate and initialise one unix sock.  @sock is NULL for the embryo
 * made on behalf of a stream connect; such an orphan starts with
 * inflight == -1, corrected when it is queued on the listener.  Returns
 * the new sock, or NULL on memory failure or when the 2*max_files
 * socket limit is hit.
 */
static struct sock * unix_create1(struct socket *sock)
{
	struct sock *sk = NULL;
	struct unix_sock *u;

	if (atomic_read(&unix_nr_socks) >= 2*get_max_files())
		goto out;

	sk = sk_alloc(PF_UNIX, GFP_KERNEL, &unix_proto, 1);
	if (!sk)
		goto out;

	atomic_inc(&unix_nr_socks);

	sock_init_data(sock,sk);

	sk->sk_write_space	= unix_write_space;
	sk->sk_max_ack_backlog	= sysctl_unix_max_dgram_qlen;
	sk->sk_destruct		= unix_sock_destructor;
	u	  = unix_sk(sk);
	u->dentry = NULL;
	u->mnt	  = NULL;
	spin_lock_init(&u->lock);
	atomic_set(&u->inflight, sock ? 0 : -1);
	mutex_init(&u->readlock); /* single task reading lock */
	init_waitqueue_head(&u->peer_wait);
	unix_insert_socket(unix_sockets_unbound, sk);
out:
	return sk;
}
599
/*
 * socket(2) entry: pick the proto_ops matching the requested type and
 * allocate the sock.  SOCK_RAW is silently downgraded to SOCK_DGRAM.
 */
static int unix_create(struct socket *sock, int protocol)
{
	if (protocol && protocol != PF_UNIX)
		return -EPROTONOSUPPORT;

	sock->state = SS_UNCONNECTED;

	switch (sock->type) {
	case SOCK_STREAM:
		sock->ops = &unix_stream_ops;
		break;
		/*
		 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
		 *	nothing uses it.
		 */
	case SOCK_RAW:
		sock->type=SOCK_DGRAM;
		/* fall through - RAW is served by the datagram ops */
	case SOCK_DGRAM:
		sock->ops = &unix_dgram_ops;
		break;
	case SOCK_SEQPACKET:
		sock->ops = &unix_seqpacket_ops;
		break;
	default:
		return -ESOCKTNOSUPPORT;
	}

	return unix_create1(sock) ? 0 : -ENOMEM;
}
629
630static int unix_release(struct socket *sock)
631{
632	struct sock *sk = sock->sk;
633
634	if (!sk)
635		return 0;
636
637	sock->sk = NULL;
638
639	return unix_release_sock (sk, 0);
640}
641
/*
 * Bind an unbound socket to an autogenerated abstract name of the form
 * "\0XXXXX" (five hex digits).  Used on behalf of senders that need an
 * address (e.g. SOCK_PASSCRED).  u->readlock serialises against a
 * concurrent explicit bind on the same socket.
 */
static int unix_autobind(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	static u32 ordernum = 1;
	struct unix_address * addr;
	int err;

	mutex_lock(&u->readlock);

	err = 0;
	if (u->addr)
		goto out;		/* already bound: nothing to do */

	err = -ENOMEM;
	addr = kmalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL);
	if (!addr)
		goto out;

	memset(addr, 0, sizeof(*addr) + sizeof(short) + 16);
	addr->name->sun_family = AF_UNIX;
	atomic_set(&addr->refcnt, 1);

retry:
	addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short);
	addr->hash = unix_hash_fold(csum_partial((void*)addr->name, addr->len, 0));

	spin_lock(&unix_table_lock);
	ordernum = (ordernum+1)&0xFFFFF;

	/* NOTE(review): if all 2^20 generated names were taken this loop
	 * would never terminate; presumed unreachable in practice. */
	if (__unix_find_socket_byname(addr->name, addr->len, sock->type,
				      addr->hash)) {
		spin_unlock(&unix_table_lock);
		/* Sanity yield. It is unusual case, but yet... */
		if (!(ordernum&0xFF))
			yield();
		goto retry;
	}
	addr->hash ^= sk->sk_type;

	__unix_remove_socket(sk);
	u->addr = addr;
	__unix_insert_socket(&unix_socket_table[addr->hash], sk);
	spin_unlock(&unix_table_lock);
	err = 0;

out:	mutex_unlock(&u->readlock);
	return err;
}
691
/*
 * Resolve a sockaddr to its target unix socket.  Filesystem names go
 * through path lookup (write permission on the inode is required, and
 * its atime is touched on a type match); abstract names go through the
 * hash table.  Returns a referenced sock, or NULL with *error set.
 */
static struct sock *unix_find_other(struct sockaddr_un *sunname, int len,
				    int type, unsigned hash, int *error)
{
	struct sock *u;
	struct nameidata nd;
	int err = 0;

	if (sunname->sun_path[0]) {
		err = path_lookup(sunname->sun_path, LOOKUP_FOLLOW, &nd);
		if (err)
			goto fail;
		err = vfs_permission(&nd, MAY_WRITE);
		if (err)
			goto put_fail;

		err = -ECONNREFUSED;
		if (!S_ISSOCK(nd.dentry->d_inode->i_mode))
			goto put_fail;
		u=unix_find_socket_byinode(nd.dentry->d_inode);
		if (!u)
			goto put_fail;

		if (u->sk_type == type)
			touch_atime(nd.mnt, nd.dentry);

		path_release(&nd);

		/* found a socket, but of the wrong type */
		err=-EPROTOTYPE;
		if (u->sk_type != type) {
			sock_put(u);
			goto fail;
		}
	} else {
		err = -ECONNREFUSED;
		u=unix_find_socket_byname(sunname, len, type, hash);
		if (u) {
			struct dentry *dentry;
			dentry = unix_sk(u)->dentry;
			if (dentry)
				touch_atime(unix_sk(u)->mnt, dentry);
		} else
			goto fail;
	}
	return u;

put_fail:
	path_release(&nd);
fail:
	*error=err;
	return NULL;
}
743
744
/*
 * bind(2).  A family-only (zero-length) address triggers autobind.  An
 * abstract name is inserted in the hash table after a uniqueness check;
 * a filesystem name creates a socket inode via vfs_mknod() and hashes
 * by inode number.  u->readlock serialises with autobind; a socket may
 * be bound only once (-EINVAL thereafter).
 */
static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	struct sockaddr_un *sunaddr=(struct sockaddr_un *)uaddr;
	struct dentry * dentry = NULL;
	struct nameidata nd;
	int err;
	unsigned hash;
	struct unix_address *addr;
	struct hlist_head *list;

	err = -EINVAL;
	if (sunaddr->sun_family != AF_UNIX)
		goto out;

	if (addr_len==sizeof(short)) {
		err = unix_autobind(sock);
		goto out;
	}

	err = unix_mkname(sunaddr, addr_len, &hash);
	if (err < 0)
		goto out;
	addr_len = err;

	mutex_lock(&u->readlock);

	err = -EINVAL;
	if (u->addr)
		goto out_up;

	err = -ENOMEM;
	addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
	if (!addr)
		goto out_up;

	memcpy(addr->name, sunaddr, addr_len);
	addr->len = addr_len;
	addr->hash = hash ^ sk->sk_type;
	atomic_set(&addr->refcnt, 1);

	if (sunaddr->sun_path[0]) {
		unsigned int mode;
		err = 0;
		/*
		 * Get the parent directory, calculate the hash for last
		 * component.
		 */
		err = path_lookup(sunaddr->sun_path, LOOKUP_PARENT, &nd);
		if (err)
			goto out_mknod_parent;

		dentry = lookup_create(&nd, 0);
		err = PTR_ERR(dentry);
		if (IS_ERR(dentry))
			goto out_mknod_unlock;

		/*
		 * All right, let's create it.
		 */
		mode = S_IFSOCK |
		       (SOCK_INODE(sock)->i_mode & ~current->fs->umask);
		err = vfs_mknod(nd.dentry->d_inode, dentry, mode, 0);
		if (err)
			goto out_mknod_dput;
		/* mknod succeeded: swap the parent dentry in nd for the
		 * new socket's dentry. */
		mutex_unlock(&nd.dentry->d_inode->i_mutex);
		dput(nd.dentry);
		nd.dentry = dentry;

		/* FS-bound sockets are marked with the out-of-range hash. */
		addr->hash = UNIX_HASH_SIZE;
	}

	spin_lock(&unix_table_lock);

	if (!sunaddr->sun_path[0]) {
		err = -EADDRINUSE;
		if (__unix_find_socket_byname(sunaddr, addr_len,
					      sk->sk_type, hash)) {
			unix_release_addr(addr);
			goto out_unlock;
		}

		list = &unix_socket_table[addr->hash];
	} else {
		/* FS names are looked up by inode, so bucket by i_ino. */
		list = &unix_socket_table[dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1)];
		u->dentry = nd.dentry;
		u->mnt    = nd.mnt;
	}

	err = 0;
	__unix_remove_socket(sk);
	u->addr = addr;
	__unix_insert_socket(list, sk);

out_unlock:
	spin_unlock(&unix_table_lock);
out_up:
	mutex_unlock(&u->readlock);
out:
	return err;

out_mknod_dput:
	dput(dentry);
out_mknod_unlock:
	mutex_unlock(&nd.dentry->d_inode->i_mutex);
	path_release(&nd);
out_mknod_parent:
	if (err==-EEXIST)
		err=-EADDRINUSE;
	unix_release_addr(addr);
	goto out_up;
}
858
/*
 * connect(2) for datagram sockets: set (or, with AF_UNSPEC, clear) the
 * default peer.  On reconnect to a different peer, packets queued from
 * the old association are flushed via unix_dgram_disconnected().
 */
static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
			      int alen, int flags)
{
	struct sock *sk = sock->sk;
	struct sockaddr_un *sunaddr=(struct sockaddr_un*)addr;
	struct sock *other;
	unsigned hash;
	int err;

	if (addr->sa_family != AF_UNSPEC) {
		err = unix_mkname(sunaddr, alen, &hash);
		if (err < 0)
			goto out;
		alen = err;

		if (test_bit(SOCK_PASSCRED, &sock->flags) &&
		    !unix_sk(sk)->addr && (err = unix_autobind(sock)) != 0)
			goto out;

		other=unix_find_other(sunaddr, alen, sock->type, hash, &err);
		if (!other)
			goto out;

		unix_state_wlock(sk);

		err = -EPERM;
		if (!unix_may_send(sk, other))
			goto out_unlock;

		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
		if (err)
			goto out_unlock;

	} else {
		/*
		 *	1003.1g breaking connected state with AF_UNSPEC
		 */
		other = NULL;
		unix_state_wlock(sk);
	}

	/*
	 * If it was connected, reconnect.
	 */
	if (unix_peer(sk)) {
		struct sock *old_peer = unix_peer(sk);
		unix_peer(sk)=other;
		unix_state_wunlock(sk);

		if (other != old_peer)
			unix_dgram_disconnected(sk, old_peer);
		sock_put(old_peer);
	} else {
		unix_peer(sk)=other;
		unix_state_wunlock(sk);
	}
 	return 0;

out_unlock:
	unix_state_wunlock(sk);
	sock_put(other);
out:
	return err;
}
923
/*
 * Connect-side helper: sleep until the listener's receive queue may have
 * drained below its backlog, the listener died or shut down, or the
 * timeout elapses.  Called with other's state read-lock held; returns
 * with that lock RELEASED and the remaining timeout.
 */
static long unix_wait_for_peer(struct sock *other, long timeo)
{
	struct unix_sock *u = unix_sk(other);
	int sched;
	DEFINE_WAIT(wait);

	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);

	/* only sleep while the queue genuinely remains over backlog */
	sched = !sock_flag(other, SOCK_DEAD) &&
		!(other->sk_shutdown & RCV_SHUTDOWN) &&
		(skb_queue_len(&other->sk_receive_queue) >
		 other->sk_max_ack_backlog);

	unix_state_runlock(other);

	if (sched)
		timeo = schedule_timeout(timeo);

	finish_wait(&u->peer_wait, &wait);
	return timeo;
}
945
/*
 * connect(2) for stream/seqpacket sockets.  Allocates an embryonic sock
 * plus a carrier skb up front, then queues the embryo on the listener's
 * receive queue for unix_accept() to pick up.  Blocks (per the socket's
 * send timeout) when the listener's backlog is full unless nonblocking.
 */
static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
			       int addr_len, int flags)
{
	struct sockaddr_un *sunaddr=(struct sockaddr_un *)uaddr;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
	struct sock *newsk = NULL;
	struct sock *other = NULL;
	struct sk_buff *skb = NULL;
	unsigned hash;
	int st;
	int err;
	long timeo;

	err = unix_mkname(sunaddr, addr_len, &hash);
	if (err < 0)
		goto out;
	addr_len = err;

	if (test_bit(SOCK_PASSCRED, &sock->flags)
		&& !u->addr && (err = unix_autobind(sock)) != 0)
		goto out;

	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

	/* First of all allocate resources.
	   If we will make it after state is locked,
	   we will have to recheck all again in any case.
	 */

	err = -ENOMEM;

	/* create new sock for complete connection */
	newsk = unix_create1(NULL);
	if (newsk == NULL)
		goto out;

	/* Allocate skb for sending to listening sock */
	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
	if (skb == NULL)
		goto out;

restart:
	/*  Find listening sock. */
	other = unix_find_other(sunaddr, addr_len, sk->sk_type, hash, &err);
	if (!other)
		goto out;

	/* Latch state of peer */
	unix_state_rlock(other);

	/* Apparently VFS overslept socket death. Retry. */
	if (sock_flag(other, SOCK_DEAD)) {
		unix_state_runlock(other);
		sock_put(other);
		goto restart;
	}

	err = -ECONNREFUSED;
	if (other->sk_state != TCP_LISTEN)
		goto out_unlock;

	if (skb_queue_len(&other->sk_receive_queue) >
	    other->sk_max_ack_backlog) {
		err = -EAGAIN;
		if (!timeo)
			goto out_unlock;

		/* releases other's state lock; re-lookup afterwards */
		timeo = unix_wait_for_peer(other, timeo);

		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			goto out;
		sock_put(other);
		goto restart;
        }

	/* Latch our state.

	   It is tricky place. We need to grab write lock and cannot
	   drop lock on peer. It is dangerous because deadlock is
	   possible. Connect to self case and simultaneous
	   attempt to connect are eliminated by checking socket
	   state. other is TCP_LISTEN, if sk is TCP_LISTEN we
	   check this before attempt to grab lock.

	   Well, and we have to recheck the state after socket locked.
	 */
	st = sk->sk_state;

	switch (st) {
	case TCP_CLOSE:
		/* This is ok... continue with connect */
		break;
	case TCP_ESTABLISHED:
		/* Socket is already connected */
		err = -EISCONN;
		goto out_unlock;
	default:
		err = -EINVAL;
		goto out_unlock;
	}

	unix_state_wlock(sk);

	if (sk->sk_state != st) {
		unix_state_wunlock(sk);
		unix_state_runlock(other);
		sock_put(other);
		goto restart;
	}

	err = security_unix_stream_connect(sock, other->sk_socket, newsk);
	if (err) {
		unix_state_wunlock(sk);
		goto out_unlock;
	}

	/* The way is open! Fastly set all the necessary fields... */

	sock_hold(sk);
	unix_peer(newsk)	= sk;
	newsk->sk_state		= TCP_ESTABLISHED;
	newsk->sk_type		= sk->sk_type;
	newsk->sk_peercred.pid	= current->tgid;
	newsk->sk_peercred.uid	= current->euid;
	newsk->sk_peercred.gid	= current->egid;
	newu = unix_sk(newsk);
	newsk->sk_sleep		= &newu->peer_wait;
	otheru = unix_sk(other);

	/* copy address information from listening to new sock*/
	if (otheru->addr) {
		atomic_inc(&otheru->addr->refcnt);
		newu->addr = otheru->addr;
	}
	if (otheru->dentry) {
		newu->dentry	= dget(otheru->dentry);
		newu->mnt	= mntget(otheru->mnt);
	}

	/* Set credentials */
	sk->sk_peercred = other->sk_peercred;

	sock->state	= SS_CONNECTED;
	sk->sk_state	= TCP_ESTABLISHED;
	sock_hold(newsk);

	smp_mb__after_atomic_inc();	/* sock_hold() does an atomic_inc() */
	unix_peer(sk)	= newsk;

	unix_state_wunlock(sk);

	/* take ten and and send info to listening sock */
	spin_lock(&other->sk_receive_queue.lock);
	__skb_queue_tail(&other->sk_receive_queue, skb);
	/* Undo artificially decreased inflight after embrion
	 * is installed to listening socket. */
	atomic_inc(&newu->inflight);
	spin_unlock(&other->sk_receive_queue.lock);
	unix_state_runlock(other);
	other->sk_data_ready(other, 0);
	sock_put(other);
	return 0;

out_unlock:
	if (other)
		unix_state_runlock(other);

out:
	if (skb)
		kfree_skb(skb);
	if (newsk)
		unix_release_sock(newsk, 0);
	if (other)
		sock_put(other);
	return err;
}
1124
1125static int unix_socketpair(struct socket *socka, struct socket *sockb)
1126{
1127	struct sock *ska=socka->sk, *skb = sockb->sk;
1128
1129	/* Join our sockets back to back */
1130	sock_hold(ska);
1131	sock_hold(skb);
1132	unix_peer(ska)=skb;
1133	unix_peer(skb)=ska;
1134	ska->sk_peercred.pid = skb->sk_peercred.pid = current->tgid;
1135	ska->sk_peercred.uid = skb->sk_peercred.uid = current->euid;
1136	ska->sk_peercred.gid = skb->sk_peercred.gid = current->egid;
1137
1138	if (ska->sk_type != SOCK_DGRAM) {
1139		ska->sk_state = TCP_ESTABLISHED;
1140		skb->sk_state = TCP_ESTABLISHED;
1141		socka->state  = SS_CONNECTED;
1142		sockb->state  = SS_CONNECTED;
1143	}
1144	return 0;
1145}
1146
static int unix_accept(struct socket *sock, struct socket *newsock, int flags)
{
	struct sock *sk = sock->sk;
	struct sock *tsk;
	struct sk_buff *skb;
	int err;

	/* Only connection-oriented types can accept. */
	err = -EOPNOTSUPP;
	if (sock->type!=SOCK_STREAM && sock->type!=SOCK_SEQPACKET)
		goto out;

	err = -EINVAL;
	if (sk->sk_state != TCP_LISTEN)
		goto out;

	/* If socket state is TCP_LISTEN it cannot change (for now...),
	 * so that no locks are necessary.
	 */

	/* Each skb queued on a listener carries one embryonic connection;
	 * dequeue the next one (blocks unless O_NONBLOCK was given). */
	skb = skb_recv_datagram(sk, 0, flags&O_NONBLOCK, &err);
	if (!skb) {
		/* This means receive shutdown. */
		if (err == 0)
			err = -EINVAL;
		goto out;
	}

	tsk = skb->sk;
	skb_free_datagram(sk, skb);
	/* A backlog slot opened; let connectors blocked in connect() retry. */
	wake_up_interruptible(&unix_sk(sk)->peer_wait);

	/* attach accepted sock to socket */
	unix_state_wlock(tsk);
	newsock->state = SS_CONNECTED;
	sock_graft(tsk, newsock);
	unix_state_wunlock(tsk);
	return 0;

out:
	return err;
}
1188
1189
static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_len, int peer)
{
	struct sock *sk = sock->sk;
	struct unix_sock *u;
	struct sockaddr_un *sunaddr=(struct sockaddr_un *)uaddr;
	int err = 0;

	if (peer) {
		/* getpeername(): switch to the peer socket (takes a ref). */
		sk = unix_peer_get(sk);

		err = -ENOTCONN;
		if (!sk)
			goto out;
		err = 0;
	} else {
		/* getsockname(): take a ref so both paths drop one below. */
		sock_hold(sk);
	}

	u = unix_sk(sk);
	unix_state_rlock(sk);
	if (!u->addr) {
		/* Unbound socket: report only the address family. */
		sunaddr->sun_family = AF_UNIX;
		sunaddr->sun_path[0] = 0;
		*uaddr_len = sizeof(short);
	} else {
		struct unix_address *addr = u->addr;

		*uaddr_len = addr->len;
		memcpy(sunaddr, addr->name, *uaddr_len);
	}
	unix_state_runlock(sk);
	sock_put(sk);
out:
	return err;
}
1225
1226static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1227{
1228	int i;
1229
1230	scm->fp = UNIXCB(skb).fp;
1231	skb->destructor = sock_wfree;
1232	UNIXCB(skb).fp = NULL;
1233
1234	for (i=scm->fp->count-1; i>=0; i--)
1235		unix_notinflight(scm->fp->fp[i]);
1236}
1237
/* skb destructor for skbs carrying passed file descriptors: release the
 * fd references, then give back the accounted send-buffer space. */
static void unix_destruct_fds(struct sk_buff *skb)
{
	struct scm_cookie scm;
	memset(&scm, 0, sizeof(scm));
	unix_detach_fds(&scm, skb);

	/* Alas, it calls VFS */
	/* So fscking what? fput() had been SMP-safe since the last Summer */
	scm_destroy(&scm);
	sock_wfree(skb);
}
1249
1250static void unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1251{
1252	int i;
1253	for (i=scm->fp->count-1; i>=0; i--)
1254		unix_inflight(scm->fp->fp[i]);
1255	UNIXCB(skb).fp = scm->fp;
1256	skb->destructor = unix_destruct_fds;
1257	scm->fp = NULL;
1258}
1259
1260/*
1261 *	Send AF_UNIX data.
1262 */
1263
static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock,
			      struct msghdr *msg, size_t len)
{
	struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	struct sockaddr_un *sunaddr=msg->msg_name;
	struct sock *other = NULL;
	int namelen = 0; /* fake GCC */
	int err;
	unsigned hash;
	struct sk_buff *skb;
	long timeo;
	struct scm_cookie tmp_scm;

	/* Synchronous callers may arrive without an scm cookie. */
	if (NULL == siocb->scm)
		siocb->scm = &tmp_scm;
	err = scm_send(sock, msg, siocb->scm);
	if (err < 0)
		return err;

	err = -EOPNOTSUPP;
	if (msg->msg_flags&MSG_OOB)
		goto out;

	if (msg->msg_namelen) {
		/* Explicit destination: validate the name and compute its hash. */
		err = unix_mkname(sunaddr, msg->msg_namelen, &hash);
		if (err < 0)
			goto out;
		namelen = err;
	} else {
		/* No address: send to the connected peer, if any. */
		sunaddr = NULL;
		err = -ENOTCONN;
		other = unix_peer_get(sk);
		if (!other)
			goto out;
	}

	/* SO_PASSCRED needs a bound address; autobind an unbound socket. */
	if (test_bit(SOCK_PASSCRED, &sock->flags)
		&& !u->addr && (err = unix_autobind(sock)) != 0)
		goto out;

	err = -EMSGSIZE;
	if (len > sk->sk_sndbuf - 32)
		goto out;

	skb = sock_alloc_send_skb(sk, len, msg->msg_flags&MSG_DONTWAIT, &err);
	if (skb==NULL)
		goto out;

	/* Stash sender credentials (and any passed fds) on the skb. */
	memcpy(UNIXCREDS(skb), &siocb->scm->creds, sizeof(struct ucred));
	if (siocb->scm->fp)
		unix_attach_fds(siocb->scm, skb);

	unix_get_peersec_dgram(skb);

	skb->h.raw = skb->data;
	err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
	if (err)
		goto out_free;

	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);

restart:
	if (!other) {
		err = -ECONNRESET;
		if (sunaddr == NULL)
			goto out_free;

		/* Look up the destination socket by address. */
		other = unix_find_other(sunaddr, namelen, sk->sk_type,
					hash, &err);
		if (other==NULL)
			goto out_free;
	}

	unix_state_rlock(other);
	err = -EPERM;
	if (!unix_may_send(sk, other))
		goto out_unlock;

	if (sock_flag(other, SOCK_DEAD)) {
		/*
		 *	Check with 1003.1g - what should
		 *	datagram error
		 */
		unix_state_runlock(other);
		sock_put(other);

		err = 0;
		unix_state_wlock(sk);
		if (unix_peer(sk) == other) {
			/* Our connected peer died: disconnect and fail. */
			unix_peer(sk)=NULL;
			unix_state_wunlock(sk);

			unix_dgram_disconnected(sk, other);
			sock_put(other);
			err = -ECONNREFUSED;
		} else {
			unix_state_wunlock(sk);
		}

		/* Addressed send hit a dying socket: retry the lookup. */
		other = NULL;
		if (err)
			goto out_free;
		goto restart;
	}

	err = -EPIPE;
	if (other->sk_shutdown & RCV_SHUTDOWN)
		goto out_unlock;

	if (sk->sk_type != SOCK_SEQPACKET) {
		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
		if (err)
			goto out_unlock;
	}

	/* Receiver backlog full (and it is not connected back to us):
	 * wait for space, or fail with EAGAIN when non-blocking. */
	if (unix_peer(other) != sk &&
	    (skb_queue_len(&other->sk_receive_queue) >
	     other->sk_max_ack_backlog)) {
		if (!timeo) {
			err = -EAGAIN;
			goto out_unlock;
		}

		timeo = unix_wait_for_peer(other, timeo);

		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			goto out_free;

		goto restart;
	}

	skb_queue_tail(&other->sk_receive_queue, skb);
	unix_state_runlock(other);
	other->sk_data_ready(other, len);
	sock_put(other);
	scm_destroy(siocb->scm);
	return len;

out_unlock:
	unix_state_runlock(other);
out_free:
	kfree_skb(skb);
out:
	if (other)
		sock_put(other);
	scm_destroy(siocb->scm);
	return err;
}
1415
1416
static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
			       struct msghdr *msg, size_t len)
{
	struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
	struct sock *sk = sock->sk;
	struct sock *other = NULL;
	struct sockaddr_un *sunaddr=msg->msg_name;
	int err,size;
	struct sk_buff *skb;
	int sent=0;
	struct scm_cookie tmp_scm;

	/* Synchronous callers may arrive without an scm cookie. */
	if (NULL == siocb->scm)
		siocb->scm = &tmp_scm;
	err = scm_send(sock, msg, siocb->scm);
	if (err < 0)
		return err;

	err = -EOPNOTSUPP;
	if (msg->msg_flags&MSG_OOB)
		goto out_err;

	if (msg->msg_namelen) {
		/* Stream sockets take no destination address. */
		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
		goto out_err;
	} else {
		sunaddr = NULL;
		err = -ENOTCONN;
		other = unix_peer(sk);
		if (!other)
			goto out_err;
	}

	if (sk->sk_shutdown & SEND_SHUTDOWN)
		goto pipe_err;

	/* Chop the payload into individually allocated skbs. */
	while(sent < len)
	{
		/*
		 *	Optimisation for the fact that under 0.01% of X
		 *	messages typically need breaking up.
		 */

		size = len-sent;

		/* Keep two messages in the pipe so it schedules better */
		if (size > ((sk->sk_sndbuf >> 1) - 64))
			size = (sk->sk_sndbuf >> 1) - 64;

		if (size > SKB_MAX_ALLOC)
			size = SKB_MAX_ALLOC;

		/*
		 *	Grab a buffer
		 */

		skb=sock_alloc_send_skb(sk,size,msg->msg_flags&MSG_DONTWAIT, &err);

		if (skb==NULL)
			goto out_err;

		/*
		 *	If you pass two values to the sock_alloc_send_skb
		 *	it tries to grab the large buffer with GFP_NOFS
		 *	(which can fail easily), and if it fails grab the
		 *	fallback size buffer which is under a page and will
		 *	succeed. [Alan]
		 */
		size = min_t(int, size, skb_tailroom(skb));

		/* Credentials (and any passed fds) ride on every chunk. */
		memcpy(UNIXCREDS(skb), &siocb->scm->creds, sizeof(struct ucred));
		if (siocb->scm->fp)
			unix_attach_fds(siocb->scm, skb);

		if ((err = memcpy_fromiovec(skb_put(skb,size), msg->msg_iov, size)) != 0) {
			kfree_skb(skb);
			goto out_err;
		}

		unix_state_rlock(other);

		if (sock_flag(other, SOCK_DEAD) ||
		    (other->sk_shutdown & RCV_SHUTDOWN))
			goto pipe_err_free;

		skb_queue_tail(&other->sk_receive_queue, skb);
		unix_state_runlock(other);
		other->sk_data_ready(other, size);
		sent+=size;
	}

	scm_destroy(siocb->scm);
	siocb->scm = NULL;

	return sent;

pipe_err_free:
	unix_state_runlock(other);
	kfree_skb(skb);
pipe_err:
	/* Writing to a closed stream raises SIGPIPE unless suppressed. */
	if (sent==0 && !(msg->msg_flags&MSG_NOSIGNAL))
		send_sig(SIGPIPE,current,0);
	err = -EPIPE;
out_err:
	scm_destroy(siocb->scm);
	siocb->scm = NULL;
	/* A partial write reports the byte count, not the error. */
	return sent ? : err;
}
1525
1526static int unix_seqpacket_sendmsg(struct kiocb *kiocb, struct socket *sock,
1527				  struct msghdr *msg, size_t len)
1528{
1529	int err;
1530	struct sock *sk = sock->sk;
1531
1532	err = sock_error(sk);
1533	if (err)
1534		return err;
1535
1536	if (sk->sk_state != TCP_ESTABLISHED)
1537		return -ENOTCONN;
1538
1539	if (msg->msg_namelen)
1540		msg->msg_namelen = 0;
1541
1542	return unix_dgram_sendmsg(kiocb, sock, msg, len);
1543}
1544
1545static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
1546{
1547	struct unix_sock *u = unix_sk(sk);
1548
1549	msg->msg_namelen = 0;
1550	if (u->addr) {
1551		msg->msg_namelen = u->addr->len;
1552		memcpy(msg->msg_name, u->addr->name, u->addr->len);
1553	}
1554}
1555
static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
			      struct msghdr *msg, size_t size,
			      int flags)
{
	struct sock_iocb *siocb = kiocb_to_siocb(iocb);
	struct scm_cookie tmp_scm;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	int noblock = flags & MSG_DONTWAIT;
	struct sk_buff *skb;
	int err;

	err = -EOPNOTSUPP;
	if (flags&MSG_OOB)
		goto out;

	msg->msg_namelen = 0;

	/* Serialize readers so fd-passing state stays consistent. */
	mutex_lock(&u->readlock);

	skb = skb_recv_datagram(sk, flags, noblock, &err);
	if (!skb)
		goto out_unlock;

	/* A queue slot opened up; wake senders blocked on the backlog. */
	wake_up_interruptible(&u->peer_wait);

	if (msg->msg_name)
		unix_copy_addr(msg, skb->sk);

	/* Truncate oversized datagrams and flag it to the caller. */
	if (size > skb->len)
		size = skb->len;
	else if (size < skb->len)
		msg->msg_flags |= MSG_TRUNC;

	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, size);
	if (err)
		goto out_free;

	if (!siocb->scm) {
		siocb->scm = &tmp_scm;
		memset(&tmp_scm, 0, sizeof(tmp_scm));
	}
	/* Hand the sender's credentials and security data to the caller. */
	siocb->scm->creds = *UNIXCREDS(skb);
	unix_set_secdata(siocb->scm, skb);

	if (!(flags & MSG_PEEK))
	{
		/* Consuming read: take ownership of any passed fds. */
		if (UNIXCB(skb).fp)
			unix_detach_fds(siocb->scm, skb);
	}
	else
	{
		/* It is questionable: on PEEK we could:
		   - do not return fds - good, but too simple 8)
		   - return fds, and do not return them on read (old strategy,
		     apparently wrong)
		   - clone fds (I chose it for now, it is the most universal
		     solution)

	           POSIX 1003.1g does not actually define this clearly
	           at all. POSIX 1003.1g doesn't define a lot of things
	           clearly however!

		*/
		if (UNIXCB(skb).fp)
			siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);
	}
	err = size;

	scm_recv(sock, msg, siocb->scm, flags);

out_free:
	skb_free_datagram(sk,skb);
out_unlock:
	mutex_unlock(&u->readlock);
out:
	return err;
}
1634
1635/*
 *	Sleep until data has arrived. But check for races..
1637 */
1638
static long unix_stream_data_wait(struct sock * sk, long timeo)
{
	DEFINE_WAIT(wait);

	unix_state_rlock(sk);

	for (;;) {
		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);

		/* Stop waiting on data, error, shutdown, signal or timeout. */
		if (!skb_queue_empty(&sk->sk_receive_queue) ||
		    sk->sk_err ||
		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
		    signal_pending(current) ||
		    !timeo)
			break;

		/* Drop the state lock while asleep so writers can progress. */
		set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
		unix_state_runlock(sk);
		timeo = schedule_timeout(timeo);
		unix_state_rlock(sk);
		clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
	}

	finish_wait(sk->sk_sleep, &wait);
	unix_state_runlock(sk);
	return timeo;
}
1666
1667
1668
static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
			       struct msghdr *msg, size_t size,
			       int flags)
{
	struct sock_iocb *siocb = kiocb_to_siocb(iocb);
	struct scm_cookie tmp_scm;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	struct sockaddr_un *sunaddr=msg->msg_name;
	int copied = 0;
	int check_creds = 0;
	int target;
	int err = 0;
	long timeo;

	err = -EINVAL;
	if (sk->sk_state != TCP_ESTABLISHED)
		goto out;

	err = -EOPNOTSUPP;
	if (flags&MSG_OOB)
		goto out;

	/* target: bytes needed before returning (SO_RCVLOWAT / MSG_WAITALL). */
	target = sock_rcvlowat(sk, flags&MSG_WAITALL, size);
	timeo = sock_rcvtimeo(sk, flags&MSG_DONTWAIT);

	msg->msg_namelen = 0;

	/* Lock the socket to prevent queue disordering
	 * while sleeps in memcpy_tomsg
	 */

	if (!siocb->scm) {
		siocb->scm = &tmp_scm;
		memset(&tmp_scm, 0, sizeof(tmp_scm));
	}

	mutex_lock(&u->readlock);

	do
	{
		int chunk;
		struct sk_buff *skb;

		skb = skb_dequeue(&sk->sk_receive_queue);
		if (skb==NULL)
		{
			if (copied >= target)
				break;

			/*
			 *	POSIX 1003.1g mandates this order.
			 */

			if ((err = sock_error(sk)) != 0)
				break;
			if (sk->sk_shutdown & RCV_SHUTDOWN)
				break;
			err = -EAGAIN;
			if (!timeo)
				break;
			/* Release the read mutex while waiting for data. */
			mutex_unlock(&u->readlock);

			timeo = unix_stream_data_wait(sk, timeo);

			if (signal_pending(current)) {
				err = sock_intr_errno(timeo);
				goto out;
			}
			mutex_lock(&u->readlock);
			continue;
		}

		if (check_creds) {
			/* Never glue messages from different writers */
			if (memcmp(UNIXCREDS(skb), &siocb->scm->creds, sizeof(siocb->scm->creds)) != 0) {
				skb_queue_head(&sk->sk_receive_queue, skb);
				break;
			}
		} else {
			/* Copy credentials */
			siocb->scm->creds = *UNIXCREDS(skb);
			check_creds = 1;
		}

		/* Copy address just once */
		if (sunaddr)
		{
			unix_copy_addr(msg, skb->sk);
			sunaddr = NULL;
		}

		chunk = min_t(unsigned int, skb->len, size);
		if (memcpy_toiovec(msg->msg_iov, skb->data, chunk)) {
			/* Copy fault: requeue the skb, report what we have. */
			skb_queue_head(&sk->sk_receive_queue, skb);
			if (copied == 0)
				copied = -EFAULT;
			break;
		}
		copied += chunk;
		size -= chunk;

		/* Mark read part of skb as used */
		if (!(flags & MSG_PEEK))
		{
			skb_pull(skb, chunk);

			if (UNIXCB(skb).fp)
				unix_detach_fds(siocb->scm, skb);

			/* put the skb back if we didn't use it up.. */
			if (skb->len)
			{
				skb_queue_head(&sk->sk_receive_queue, skb);
				break;
			}

			kfree_skb(skb);

			/* Stop after a message carrying fds so descriptors
			 * from separate messages are never merged. */
			if (siocb->scm->fp)
				break;
		}
		else
		{
			/* It is questionable, see note in unix_dgram_recvmsg.
			 */
			if (UNIXCB(skb).fp)
				siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);

			/* put message back and return */
			skb_queue_head(&sk->sk_receive_queue, skb);
			break;
		}
	} while (size);

	mutex_unlock(&u->readlock);
	scm_recv(sock, msg, siocb->scm, flags);
out:
	return copied ? : err;
}
1809
static int unix_shutdown(struct socket *sock, int mode)
{
	struct sock *sk = sock->sk;
	struct sock *other;

	/* Map SHUT_RD/SHUT_WR/SHUT_RDWR onto the RCV/SEND shutdown bits. */
	mode = (mode+1)&(RCV_SHUTDOWN|SEND_SHUTDOWN);

	if (mode) {
		unix_state_wlock(sk);
		sk->sk_shutdown |= mode;
		other=unix_peer(sk);
		if (other)
			sock_hold(other);
		unix_state_wunlock(sk);
		sk->sk_state_change(sk);

		if (other &&
			(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {

			int peer_mode = 0;

			/* Mirror the shutdown onto the peer, directions swapped:
			 * our receive shutdown is the peer's send shutdown. */
			if (mode&RCV_SHUTDOWN)
				peer_mode |= SEND_SHUTDOWN;
			if (mode&SEND_SHUTDOWN)
				peer_mode |= RCV_SHUTDOWN;
			unix_state_wlock(other);
			other->sk_shutdown |= peer_mode;
			unix_state_wunlock(other);
			other->sk_state_change(other);
			read_lock(&other->sk_callback_lock);
			if (peer_mode == SHUTDOWN_MASK)
				sk_wake_async(other,1,POLL_HUP);
			else if (peer_mode & RCV_SHUTDOWN)
				sk_wake_async(other,1,POLL_IN);
			read_unlock(&other->sk_callback_lock);
		}
		if (other)
			sock_put(other);
	}
	return 0;
}
1851
1852static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1853{
1854	struct sock *sk = sock->sk;
1855	long amount=0;
1856	int err;
1857
1858	switch(cmd)
1859	{
1860		case SIOCOUTQ:
1861			amount = atomic_read(&sk->sk_wmem_alloc);
1862			err = put_user(amount, (int __user *)arg);
1863			break;
1864		case SIOCINQ:
1865		{
1866			struct sk_buff *skb;
1867
1868			if (sk->sk_state == TCP_LISTEN) {
1869				err = -EINVAL;
1870				break;
1871			}
1872
1873			spin_lock(&sk->sk_receive_queue.lock);
1874			if (sk->sk_type == SOCK_STREAM ||
1875			    sk->sk_type == SOCK_SEQPACKET) {
1876				skb_queue_walk(&sk->sk_receive_queue, skb)
1877					amount += skb->len;
1878			} else {
1879				skb = skb_peek(&sk->sk_receive_queue);
1880				if (skb)
1881					amount=skb->len;
1882			}
1883			spin_unlock(&sk->sk_receive_queue.lock);
1884			err = put_user(amount, (int __user *)arg);
1885			break;
1886		}
1887
1888		default:
1889			err = -ENOIOCTLCMD;
1890			break;
1891	}
1892	return err;
1893}
1894
1895static unsigned int unix_poll(struct file * file, struct socket *sock, poll_table *wait)
1896{
1897	struct sock *sk = sock->sk;
1898	unsigned int mask;
1899
1900	poll_wait(file, sk->sk_sleep, wait);
1901	mask = 0;
1902
1903	/* exceptional events? */
1904	if (sk->sk_err)
1905		mask |= POLLERR;
1906	if (sk->sk_shutdown == SHUTDOWN_MASK)
1907		mask |= POLLHUP;
1908	if (sk->sk_shutdown & RCV_SHUTDOWN)
1909		mask |= POLLRDHUP;
1910
1911	/* readable? */
1912	if (!skb_queue_empty(&sk->sk_receive_queue) ||
1913	    (sk->sk_shutdown & RCV_SHUTDOWN))
1914		mask |= POLLIN | POLLRDNORM;
1915
1916	/* Connection-based need to check for termination and startup */
1917	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) && sk->sk_state == TCP_CLOSE)
1918		mask |= POLLHUP;
1919
1920	/*
1921	 * we set writable also when the other side has shut down the
1922	 * connection. This prevents stuck sockets.
1923	 */
1924	if (unix_writable(sk))
1925		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
1926
1927	return mask;
1928}
1929
1930
1931#ifdef CONFIG_PROC_FS
1932static struct sock *unix_seq_idx(int *iter, loff_t pos)
1933{
1934	loff_t off = 0;
1935	struct sock *s;
1936
1937	for (s = first_unix_socket(iter); s; s = next_unix_socket(iter, s)) {
1938		if (off == pos)
1939			return s;
1940		++off;
1941	}
1942	return NULL;
1943}
1944
1945
1946static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
1947{
1948	spin_lock(&unix_table_lock);
1949	return *pos ? unix_seq_idx(seq->private, *pos - 1) : ((void *) 1);
1950}
1951
1952static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1953{
1954	++*pos;
1955
1956	if (v == (void *)1)
1957		return first_unix_socket(seq->private);
1958	return next_unix_socket(seq->private, v);
1959}
1960
/* Drop the table lock taken in unix_seq_start(). */
static void unix_seq_stop(struct seq_file *seq, void *v)
{
	spin_unlock(&unix_table_lock);
}
1965
static int unix_seq_show(struct seq_file *seq, void *v)
{

	/* The (void *)1 token from unix_seq_start() marks the header row. */
	if (v == (void *)1)
		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
			 "Inode Path\n");
	else {
		struct sock *s = v;
		struct unix_sock *u = unix_sk(s);
		unix_state_rlock(s);

		seq_printf(seq, "%p: %08X %08X %08X %04X %02X %5lu",
			s,
			atomic_read(&s->sk_refcnt),
			0,
			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
			s->sk_type,
			s->sk_socket ?
			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
			sock_i_ino(s));

		if (u->addr) {
			int i, len;
			seq_putc(seq, ' ');

			i = 0;
			/* Path length excludes the sun_family field. */
			len = u->addr->len - sizeof(short);
			if (!UNIX_ABSTRACT(s))
				len--;
			else {
				/* Abstract name: print '@' for the leading byte. */
				seq_putc(seq, '@');
				i++;
			}
			for ( ; i < len; i++)
				seq_putc(seq, u->addr->name->sun_path[i]);
		}
		unix_state_runlock(s);
		seq_putc(seq, '\n');
	}

	return 0;
}
2009
/* seq_file iterator callbacks backing /proc/net/unix. */
static struct seq_operations unix_seq_ops = {
	.start  = unix_seq_start,
	.next   = unix_seq_next,
	.stop   = unix_seq_stop,
	.show   = unix_seq_show,
};
2016
2017
2018static int unix_seq_open(struct inode *inode, struct file *file)
2019{
2020	struct seq_file *seq;
2021	int rc = -ENOMEM;
2022	int *iter = kmalloc(sizeof(int), GFP_KERNEL);
2023
2024	if (!iter)
2025		goto out;
2026
2027	rc = seq_open(file, &unix_seq_ops);
2028	if (rc)
2029		goto out_kfree;
2030
2031	seq	     = file->private_data;
2032	seq->private = iter;
2033	*iter = 0;
2034out:
2035	return rc;
2036out_kfree:
2037	kfree(iter);
2038	goto out;
2039}
2040
/* /proc/net/unix file operations; seq_release_private frees the iterator. */
static struct file_operations unix_seq_fops = {
	.owner		= THIS_MODULE,
	.open		= unix_seq_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_private,
};
2048
2049#endif
2050
/* PF_UNIX address-family registration; unix_create builds new sockets. */
static struct net_proto_family unix_family_ops = {
	.family = PF_UNIX,
	.create = unix_create,
	.owner	= THIS_MODULE,
};
2056
2057static int __init af_unix_init(void)
2058{
2059	int rc = -1;
2060	struct sk_buff *dummy_skb;
2061
2062	if (sizeof(struct unix_skb_parms) > sizeof(dummy_skb->cb)) {
2063		printk(KERN_CRIT "%s: panic\n", __FUNCTION__);
2064		goto out;
2065	}
2066
2067	rc = proto_register(&unix_proto, 1);
2068        if (rc != 0) {
2069                printk(KERN_CRIT "%s: Cannot create unix_sock SLAB cache!\n",
2070		       __FUNCTION__);
2071		goto out;
2072	}
2073
2074	sock_register(&unix_family_ops);
2075#ifdef CONFIG_PROC_FS
2076	proc_net_fops_create("unix", 0, &unix_seq_fops);
2077#endif
2078	unix_sysctl_register();
2079out:
2080	return rc;
2081}
2082
/* Module unload: tear down the registrations made in af_unix_init(). */
static void __exit af_unix_exit(void)
{
	sock_unregister(PF_UNIX);
	unix_sysctl_unregister();
	proc_net_remove("unix");
	proto_unregister(&unix_proto);
}
2090
2091module_init(af_unix_init);
2092module_exit(af_unix_exit);
2093
2094MODULE_LICENSE("GPL");
2095MODULE_ALIAS_NETPROTO(PF_UNIX);
2096