af_unix.c revision 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2
1/*
2 * NET4:	Implementation of BSD Unix domain sockets.
3 *
4 * Authors:	Alan Cox, <alan.cox@linux.org>
5 *
6 *		This program is free software; you can redistribute it and/or
7 *		modify it under the terms of the GNU General Public License
8 *		as published by the Free Software Foundation; either version
9 *		2 of the License, or (at your option) any later version.
10 *
11 * Version:	$Id: af_unix.c,v 1.133 2002/02/08 03:57:19 davem Exp $
12 *
13 * Fixes:
14 *		Linus Torvalds	:	Assorted bug cures.
15 *		Niibe Yutaka	:	async I/O support.
16 *		Carsten Paeth	:	PF_UNIX check, address fixes.
17 *		Alan Cox	:	Limit size of allocated blocks.
18 *		Alan Cox	:	Fixed the stupid socketpair bug.
19 *		Alan Cox	:	BSD compatibility fine tuning.
20 *		Alan Cox	:	Fixed a bug in connect when interrupted.
21 *		Alan Cox	:	Sorted out a proper draft version of
22 *					file descriptor passing hacked up from
23 *					Mike Shaver's work.
24 *		Marty Leisner	:	Fixes to fd passing
25 *		Nick Nevin	:	recvmsg bugfix.
26 *		Alan Cox	:	Started proper garbage collector
27 *		Heiko EiBfeldt	:	Missing verify_area check
28 *		Alan Cox	:	Started POSIXisms
29 *		Andreas Schwab	:	Replace inode by dentry for proper
30 *					reference counting
31 *		Kirk Petersen	:	Made this a module
32 *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
33 *					Lots of bug fixes.
 *	     Alexey Kuznetsov	:	Repaired (I hope) bugs introduced
 *					by the above two patches.
 *	     Andrea Arcangeli	:	If possible we block in connect(2)
 *					if the max backlog of the listen socket
 *					has been reached. This won't break
 *					old apps and it will avoid a huge number
 *					of hashed socks (for unix_gc()
 *					performance reasons).
42 *					Security fix that limits the max
43 *					number of socks to 2*max_files and
44 *					the number of skb queueable in the
45 *					dgram receiver.
46 *		Artur Skawina   :	Hash function optimizations
47 *	     Alexey Kuznetsov   :	Full scale SMP. Lot of bugs are introduced 8)
48 *	      Malcolm Beattie   :	Set peercred for socketpair
49 *	     Michal Ostrowski   :       Module initialization cleanup.
50 *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
51 *	     				the core infrastructure is doing that
52 *	     				for all net proto families now (2.5.69+)
53 *
54 *
55 * Known differences from reference BSD that was tested:
56 *
57 *	[TO FIX]
58 *	ECONNREFUSED is not returned from one end of a connected() socket to the
59 *		other the moment one end closes.
60 *	fstat() doesn't return st_dev=0, and give the blksize as high water mark
61 *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
62 *	[NOT TO FIX]
63 *	accept() returns a path name even if the connecting socket has closed
64 *		in the meantime (BSD loses the path and gives up).
65 *	accept() returns 0 length path for an unbound connector. BSD returns 16
66 *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
67 *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
68 *	BSD af_unix apparently has connect forgetting to block properly.
69 *		(need to check this with the POSIX spec in detail)
70 *
71 * Differences from 2.0.0-11-... (ANK)
72 *	Bug fixes and improvements.
73 *		- client shutdown killed server socket.
74 *		- removed all useless cli/sti pairs.
75 *
76 *	Semantic changes/extensions.
77 *		- generic control message passing.
78 *		- SCM_CREDENTIALS control message.
79 *		- "Abstract" (not FS based) socket bindings.
80 *		  Abstract names are sequences of bytes (not zero terminated)
81 *		  started by 0, so that this name space does not intersect
82 *		  with BSD names.
83 */
84
85#include <linux/module.h>
86#include <linux/config.h>
87#include <linux/kernel.h>
88#include <linux/major.h>
89#include <linux/signal.h>
90#include <linux/sched.h>
91#include <linux/errno.h>
92#include <linux/string.h>
93#include <linux/stat.h>
94#include <linux/dcache.h>
95#include <linux/namei.h>
96#include <linux/socket.h>
97#include <linux/un.h>
98#include <linux/fcntl.h>
99#include <linux/termios.h>
100#include <linux/sockios.h>
101#include <linux/net.h>
102#include <linux/in.h>
103#include <linux/fs.h>
104#include <linux/slab.h>
105#include <asm/uaccess.h>
106#include <linux/skbuff.h>
107#include <linux/netdevice.h>
108#include <net/sock.h>
109#include <linux/tcp.h>
110#include <net/af_unix.h>
111#include <linux/proc_fs.h>
112#include <linux/seq_file.h>
113#include <net/scm.h>
114#include <linux/init.h>
115#include <linux/poll.h>
116#include <linux/smp_lock.h>
117#include <linux/rtnetlink.h>
118#include <linux/mount.h>
119#include <net/checksum.h>
120#include <linux/security.h>
121
/*
 * Default queue limit for a new socket: unix_create1() copies this into
 * sk_max_ack_backlog.
 */
int sysctl_unix_max_dgram_qlen = 10;

/*
 * Hash table holding every AF_UNIX sock, guarded by unix_table_lock.
 * The extra slot at index UNIX_HASH_SIZE is the list of sockets that
 * have no address yet (see unix_sockets_unbound).
 */
struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1];
DEFINE_RWLOCK(unix_table_lock);
/*
 * Number of unix socks currently allocated: incremented in
 * unix_create1(), decremented in unix_sock_destructor().
 */
static atomic_t unix_nr_socks = ATOMIC_INIT(0);

/* List head used for sockets that are not bound to any name. */
#define unix_sockets_unbound	(&unix_socket_table[UNIX_HASH_SIZE])

/*
 * Non-zero when the bound address is an abstract name; unix_bind()
 * stores UNIX_HASH_SIZE in addr->hash for filesystem names.  Dereferences
 * unix_sk(sk)->addr, so it must only be used on a bound socket.
 */
#define UNIX_ABSTRACT(sk)	(unix_sk(sk)->addr->hash != UNIX_HASH_SIZE)
131
132/*
133 *  SMP locking strategy:
134 *    hash table is protected with rwlock unix_table_lock
135 *    each socket state is protected by separate rwlock.
136 */
137
138static inline unsigned unix_hash_fold(unsigned hash)
139{
140	hash ^= hash>>16;
141	hash ^= hash>>8;
142	return hash&(UNIX_HASH_SIZE-1);
143}
144
/* Peer pointer of an AF_UNIX sock; expands to an lvalue, so it is also assigned through. */
#define unix_peer(sk) (unix_sk(sk)->peer)
146
/* Non-zero when osk's peer pointer refers back to sk. */
static inline int unix_our_peer(struct sock *sk, struct sock *osk)
{
	struct sock *peer_of_osk = unix_peer(osk);

	return peer_of_osk == sk;
}
151
152static inline int unix_may_send(struct sock *sk, struct sock *osk)
153{
154	return (unix_peer(osk) == NULL || unix_our_peer(sk, osk));
155}
156
157static struct sock *unix_peer_get(struct sock *s)
158{
159	struct sock *peer;
160
161	unix_state_rlock(s);
162	peer = unix_peer(s);
163	if (peer)
164		sock_hold(peer);
165	unix_state_runlock(s);
166	return peer;
167}
168
169static inline void unix_release_addr(struct unix_address *addr)
170{
171	if (atomic_dec_and_test(&addr->refcnt))
172		kfree(addr);
173}
174
175/*
176 *	Check unix socket name:
177 *		- should be not zero length.
178 *	        - if started by not zero, should be NULL terminated (FS object)
179 *		- if started by zero, it is abstract name.
180 */
181
182static int unix_mkname(struct sockaddr_un * sunaddr, int len, unsigned *hashp)
183{
184	if (len <= sizeof(short) || len > sizeof(*sunaddr))
185		return -EINVAL;
186	if (!sunaddr || sunaddr->sun_family != AF_UNIX)
187		return -EINVAL;
188	if (sunaddr->sun_path[0]) {
189		/*
190		 * This may look like an off by one error but it is a bit more
191		 * subtle. 108 is the longest valid AF_UNIX path for a binding.
192		 * sun_path[108] doesnt as such exist.  However in kernel space
193		 * we are guaranteed that it is a valid memory location in our
194		 * kernel address buffer.
195		 */
196		((char *)sunaddr)[len]=0;
197		len = strlen(sunaddr->sun_path)+1+sizeof(short);
198		return len;
199	}
200
201	*hashp = unix_hash_fold(csum_partial((char*)sunaddr, len, 0));
202	return len;
203}
204
/*
 * Unlink sk from whichever socket list it is on.  Takes no lock itself;
 * the callers in this file hold unix_table_lock for writing.
 */
static void __unix_remove_socket(struct sock *sk)
{
	sk_del_node_init(sk);
}
209
/*
 * Link sk onto the given list.  sk must not currently be hashed (this
 * is checked with BUG_TRAP).  Takes no lock itself; the callers in this
 * file hold unix_table_lock for writing.
 */
static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
{
	BUG_TRAP(sk_unhashed(sk));
	sk_add_node(sk, list);
}
215
/* Locked variant of __unix_remove_socket(): takes unix_table_lock for writing. */
static inline void unix_remove_socket(struct sock *sk)
{
	write_lock(&unix_table_lock);
	__unix_remove_socket(sk);
	write_unlock(&unix_table_lock);
}
222
/* Locked variant of __unix_insert_socket(): takes unix_table_lock for writing. */
static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
{
	write_lock(&unix_table_lock);
	__unix_insert_socket(list, sk);
	write_unlock(&unix_table_lock);
}
229
230static struct sock *__unix_find_socket_byname(struct sockaddr_un *sunname,
231					      int len, int type, unsigned hash)
232{
233	struct sock *s;
234	struct hlist_node *node;
235
236	sk_for_each(s, node, &unix_socket_table[hash ^ type]) {
237		struct unix_sock *u = unix_sk(s);
238
239		if (u->addr->len == len &&
240		    !memcmp(u->addr->name, sunname, len))
241			goto found;
242	}
243	s = NULL;
244found:
245	return s;
246}
247
248static inline struct sock *unix_find_socket_byname(struct sockaddr_un *sunname,
249						   int len, int type,
250						   unsigned hash)
251{
252	struct sock *s;
253
254	read_lock(&unix_table_lock);
255	s = __unix_find_socket_byname(sunname, len, type, hash);
256	if (s)
257		sock_hold(s);
258	read_unlock(&unix_table_lock);
259	return s;
260}
261
262static struct sock *unix_find_socket_byinode(struct inode *i)
263{
264	struct sock *s;
265	struct hlist_node *node;
266
267	read_lock(&unix_table_lock);
268	sk_for_each(s, node,
269		    &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
270		struct dentry *dentry = unix_sk(s)->dentry;
271
272		if(dentry && dentry->d_inode == i)
273		{
274			sock_hold(s);
275			goto found;
276		}
277	}
278	s = NULL;
279found:
280	read_unlock(&unix_table_lock);
281	return s;
282}
283
284static inline int unix_writable(struct sock *sk)
285{
286	return (atomic_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
287}
288
289static void unix_write_space(struct sock *sk)
290{
291	read_lock(&sk->sk_callback_lock);
292	if (unix_writable(sk)) {
293		if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
294			wake_up_interruptible(sk->sk_sleep);
295		sk_wake_async(sk, 2, POLL_OUT);
296	}
297	read_unlock(&sk->sk_callback_lock);
298}
299
300/* When dgram socket disconnects (or changes its peer), we clear its receive
301 * queue of packets arrived from previous peer. First, it allows to do
302 * flow control based only on wmem_alloc; second, sk connected to peer
303 * may receive messages only from that peer. */
304static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
305{
306	if (skb_queue_len(&sk->sk_receive_queue)) {
307		skb_queue_purge(&sk->sk_receive_queue);
308		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
309
310		/* If one link of bidirectional dgram pipe is disconnected,
311		 * we signal error. Messages are lost. Do not make this,
312		 * when peer was not connected to us.
313		 */
314		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
315			other->sk_err = ECONNRESET;
316			other->sk_error_report(other);
317		}
318	}
319}
320
321static void unix_sock_destructor(struct sock *sk)
322{
323	struct unix_sock *u = unix_sk(sk);
324
325	skb_queue_purge(&sk->sk_receive_queue);
326
327	BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc));
328	BUG_TRAP(sk_unhashed(sk));
329	BUG_TRAP(!sk->sk_socket);
330	if (!sock_flag(sk, SOCK_DEAD)) {
331		printk("Attempt to release alive unix socket: %p\n", sk);
332		return;
333	}
334
335	if (u->addr)
336		unix_release_addr(u->addr);
337
338	atomic_dec(&unix_nr_socks);
339#ifdef UNIX_REFCNT_DEBUG
340	printk(KERN_DEBUG "UNIX %p is destroyed, %d are still alive.\n", sk, atomic_read(&unix_nr_socks));
341#endif
342}
343
/*
 *	Tear down an AF_UNIX sock.
 *
 *	@sk:      sock being released.
 *	@embrion: non-zero when sk is a connection that was still queued,
 *		  unaccepted, on a listening socket (see the recursive call
 *		  below); it forces ECONNRESET onto sk's peer.
 *
 *	The sock is unhashed, orphaned and marked closed; a stream or
 *	seqpacket peer is shut down and woken; the receive queue is drained
 *	(releasing queued embryos when sk was listening); the dentry/mnt
 *	references are dropped and finally the caller's reference on sk.
 *	Always returns 0.
 */
static int unix_release_sock (struct sock *sk, int embrion)
{
	struct unix_sock *u = unix_sk(sk);
	struct dentry *dentry;
	struct vfsmount *mnt;
	struct sock *skpair;
	struct sk_buff *skb;
	int state;

	unix_remove_socket(sk);

	/*
	 * Clear state.  The dentry/mnt pointers and the old sk_state are
	 * captured under the state lock and acted upon after it is dropped.
	 */
	unix_state_wlock(sk);
	sock_orphan(sk);
	sk->sk_shutdown = SHUTDOWN_MASK;
	dentry	     = u->dentry;
	u->dentry    = NULL;
	mnt	     = u->mnt;
	u->mnt	     = NULL;
	state = sk->sk_state;
	sk->sk_state = TCP_CLOSE;
	unix_state_wunlock(sk);

	/* Wake everyone sleeping on our peer_wait queue (see unix_wait_for_peer()). */
	wake_up_interruptible_all(&u->peer_wait);

	skpair=unix_peer(sk);

	if (skpair!=NULL) {
		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
			unix_state_wlock(skpair);
			/* No more writes */
			skpair->sk_shutdown = SHUTDOWN_MASK;
			/* Unread data, or a never-accepted connection: the peer sees a reset. */
			if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
				skpair->sk_err = ECONNRESET;
			unix_state_wunlock(skpair);
			skpair->sk_state_change(skpair);
			read_lock(&skpair->sk_callback_lock);
			sk_wake_async(skpair,1,POLL_HUP);
			read_unlock(&skpair->sk_callback_lock);
		}
		sock_put(skpair); /* It may now die */
		unix_peer(sk) = NULL;
	}

	/* Try to flush out this socket. Throw out buffers at least */

	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
		/*
		 * On a listening socket each queued skb stands for a pending
		 * connection; release the sock it carries as an embryo.
		 */
		if (state==TCP_LISTEN)
			unix_release_sock(skb->sk, 1);
		/* passed fds are erased in the kfree_skb hook	      */
		kfree_skb(skb);
	}

	if (dentry) {
		dput(dentry);
		mntput(mnt);
	}

	sock_put(sk);

	/* ---- Socket is dead now and most probably destroyed ---- */

	/*
	 * Fixme: BSD difference: In BSD all sockets connected to use get
	 *	  ECONNRESET and we die on the spot. In Linux we behave
	 *	  like files and pipes do and wait for the last
	 *	  dereference.
	 *
	 * Can't we simply set sock->err?
	 *
	 *	  What the above comment does talk about? --ANK(980817)
	 */

	/* Only run the collector while some fds are in flight. */
	if (atomic_read(&unix_tot_inflight))
		unix_gc();		/* Garbage collect fds */

	return 0;
}
422
423static int unix_listen(struct socket *sock, int backlog)
424{
425	int err;
426	struct sock *sk = sock->sk;
427	struct unix_sock *u = unix_sk(sk);
428
429	err = -EOPNOTSUPP;
430	if (sock->type!=SOCK_STREAM && sock->type!=SOCK_SEQPACKET)
431		goto out;			/* Only stream/seqpacket sockets accept */
432	err = -EINVAL;
433	if (!u->addr)
434		goto out;			/* No listens on an unbound socket */
435	unix_state_wlock(sk);
436	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
437		goto out_unlock;
438	if (backlog > sk->sk_max_ack_backlog)
439		wake_up_interruptible_all(&u->peer_wait);
440	sk->sk_max_ack_backlog	= backlog;
441	sk->sk_state		= TCP_LISTEN;
442	/* set credentials so connect can copy them */
443	sk->sk_peercred.pid	= current->tgid;
444	sk->sk_peercred.uid	= current->euid;
445	sk->sk_peercred.gid	= current->egid;
446	err = 0;
447
448out_unlock:
449	unix_state_wunlock(sk);
450out:
451	return err;
452}
453
/*
 * Forward declarations of the socket-layer handlers referenced by the
 * proto_ops tables that follow.
 */
static int unix_release(struct socket *);
static int unix_bind(struct socket *, struct sockaddr *, int);
static int unix_stream_connect(struct socket *, struct sockaddr *,
			       int addr_len, int flags);
static int unix_socketpair(struct socket *, struct socket *);
static int unix_accept(struct socket *, struct socket *, int);
static int unix_getname(struct socket *, struct sockaddr *, int *, int);
static unsigned int unix_poll(struct file *, struct socket *, poll_table *);
static int unix_ioctl(struct socket *, unsigned int, unsigned long);
static int unix_shutdown(struct socket *, int);
static int unix_stream_sendmsg(struct kiocb *, struct socket *,
			       struct msghdr *, size_t);
static int unix_stream_recvmsg(struct kiocb *, struct socket *,
			       struct msghdr *, size_t, int);
static int unix_dgram_sendmsg(struct kiocb *, struct socket *,
			      struct msghdr *, size_t);
static int unix_dgram_recvmsg(struct kiocb *, struct socket *,
			      struct msghdr *, size_t, int);
static int unix_dgram_connect(struct socket *, struct sockaddr *,
			      int, int);
static int unix_seqpacket_sendmsg(struct kiocb *, struct socket *,
				  struct msghdr *, size_t);
476
/*
 * Operations table that unix_create() installs for SOCK_STREAM sockets.
 * Socket options, mmap and sendpage are wired to the sock_no_* stubs.
 */
static struct proto_ops unix_stream_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_poll,
	.ioctl =	unix_ioctl,
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	unix_stream_sendmsg,
	.recvmsg =	unix_stream_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
};
497
/*
 * Operations table that unix_create() installs for SOCK_DGRAM sockets
 * (and for SOCK_RAW, which unix_create() turns into SOCK_DGRAM).
 * accept and listen are wired to the sock_no_* stubs, as are socket
 * options, mmap and sendpage.
 */
static struct proto_ops unix_dgram_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_dgram_connect,
	.socketpair =	unix_socketpair,
	.accept =	sock_no_accept,
	.getname =	unix_getname,
	.poll =		datagram_poll,
	.ioctl =	unix_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	unix_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	unix_dgram_sendmsg,
	.recvmsg =	unix_dgram_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
};
518
/*
 * Operations table that unix_create() installs for SOCK_SEQPACKET
 * sockets.  Connection setup (connect/accept/listen) uses the same
 * handlers as the stream table, while poll and recvmsg use the same
 * handlers as the datagram table; sendmsg has its own handler.
 */
static struct proto_ops unix_seqpacket_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		datagram_poll,
	.ioctl =	unix_ioctl,
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	unix_seqpacket_sendmsg,
	.recvmsg =	unix_dgram_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
};
539
/*
 * Protocol descriptor passed to sk_alloc() by unix_create1(); obj_size
 * is the size of struct unix_sock.
 */
static struct proto unix_proto = {
	.name	  = "UNIX",
	.owner	  = THIS_MODULE,
	.obj_size = sizeof(struct unix_sock),
};
545
546static struct sock * unix_create1(struct socket *sock)
547{
548	struct sock *sk = NULL;
549	struct unix_sock *u;
550
551	if (atomic_read(&unix_nr_socks) >= 2*files_stat.max_files)
552		goto out;
553
554	sk = sk_alloc(PF_UNIX, GFP_KERNEL, &unix_proto, 1);
555	if (!sk)
556		goto out;
557
558	atomic_inc(&unix_nr_socks);
559
560	sock_init_data(sock,sk);
561
562	sk->sk_write_space	= unix_write_space;
563	sk->sk_max_ack_backlog	= sysctl_unix_max_dgram_qlen;
564	sk->sk_destruct		= unix_sock_destructor;
565	u	  = unix_sk(sk);
566	u->dentry = NULL;
567	u->mnt	  = NULL;
568	rwlock_init(&u->lock);
569	atomic_set(&u->inflight, sock ? 0 : -1);
570	init_MUTEX(&u->readsem); /* single task reading lock */
571	init_waitqueue_head(&u->peer_wait);
572	unix_insert_socket(unix_sockets_unbound, sk);
573out:
574	return sk;
575}
576
577static int unix_create(struct socket *sock, int protocol)
578{
579	if (protocol && protocol != PF_UNIX)
580		return -EPROTONOSUPPORT;
581
582	sock->state = SS_UNCONNECTED;
583
584	switch (sock->type) {
585	case SOCK_STREAM:
586		sock->ops = &unix_stream_ops;
587		break;
588		/*
589		 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
590		 *	nothing uses it.
591		 */
592	case SOCK_RAW:
593		sock->type=SOCK_DGRAM;
594	case SOCK_DGRAM:
595		sock->ops = &unix_dgram_ops;
596		break;
597	case SOCK_SEQPACKET:
598		sock->ops = &unix_seqpacket_ops;
599		break;
600	default:
601		return -ESOCKTNOSUPPORT;
602	}
603
604	return unix_create1(sock) ? 0 : -ENOMEM;
605}
606
607static int unix_release(struct socket *sock)
608{
609	struct sock *sk = sock->sk;
610
611	if (!sk)
612		return 0;
613
614	sock->sk = NULL;
615
616	return unix_release_sock (sk, 0);
617}
618
619static int unix_autobind(struct socket *sock)
620{
621	struct sock *sk = sock->sk;
622	struct unix_sock *u = unix_sk(sk);
623	static u32 ordernum = 1;
624	struct unix_address * addr;
625	int err;
626
627	down(&u->readsem);
628
629	err = 0;
630	if (u->addr)
631		goto out;
632
633	err = -ENOMEM;
634	addr = kmalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL);
635	if (!addr)
636		goto out;
637
638	memset(addr, 0, sizeof(*addr) + sizeof(short) + 16);
639	addr->name->sun_family = AF_UNIX;
640	atomic_set(&addr->refcnt, 1);
641
642retry:
643	addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short);
644	addr->hash = unix_hash_fold(csum_partial((void*)addr->name, addr->len, 0));
645
646	write_lock(&unix_table_lock);
647	ordernum = (ordernum+1)&0xFFFFF;
648
649	if (__unix_find_socket_byname(addr->name, addr->len, sock->type,
650				      addr->hash)) {
651		write_unlock(&unix_table_lock);
652		/* Sanity yield. It is unusual case, but yet... */
653		if (!(ordernum&0xFF))
654			yield();
655		goto retry;
656	}
657	addr->hash ^= sk->sk_type;
658
659	__unix_remove_socket(sk);
660	u->addr = addr;
661	__unix_insert_socket(&unix_socket_table[addr->hash], sk);
662	write_unlock(&unix_table_lock);
663	err = 0;
664
665out:	up(&u->readsem);
666	return err;
667}
668
/*
 * Look up the socket that an address refers to.
 *
 * @sunname: address, already validated by unix_mkname().
 * @len:     its length.
 * @type:    socket type the target must have.
 * @hash:    name hash (used for abstract names only).
 * @error:   receives a negative errno on failure; it is not written on
 *	     success.
 *
 * Filesystem names are resolved through the VFS (write permission on
 * the inode is required) and then matched by inode; abstract names are
 * matched by name.  Returns the target with a reference held, or NULL.
 */
static struct sock *unix_find_other(struct sockaddr_un *sunname, int len,
				    int type, unsigned hash, int *error)
{
	struct sock *u;
	struct nameidata nd;
	int err = 0;

	if (sunname->sun_path[0]) {
		err = path_lookup(sunname->sun_path, LOOKUP_FOLLOW, &nd);
		if (err)
			goto fail;
		err = permission(nd.dentry->d_inode,MAY_WRITE, &nd);
		if (err)
			goto put_fail;

		/* Not a socket inode, or no sock bound to it: refuse. */
		err = -ECONNREFUSED;
		if (!S_ISSOCK(nd.dentry->d_inode->i_mode))
			goto put_fail;
		u=unix_find_socket_byinode(nd.dentry->d_inode);
		if (!u)
			goto put_fail;

		/* The access time is only touched when the type matches. */
		if (u->sk_type == type)
			touch_atime(nd.mnt, nd.dentry);

		path_release(&nd);

		/* Path already released: drop only the sock reference here. */
		err=-EPROTOTYPE;
		if (u->sk_type != type) {
			sock_put(u);
			goto fail;
		}
	} else {
		err = -ECONNREFUSED;
		u=unix_find_socket_byname(sunname, len, type, hash);
		if (u) {
			struct dentry *dentry;
			dentry = unix_sk(u)->dentry;
			if (dentry)
				touch_atime(unix_sk(u)->mnt, dentry);
		} else
			goto fail;
	}
	return u;

put_fail:
	path_release(&nd);
fail:
	*error=err;
	return NULL;
}
720
721
/*
 * bind() handler.
 *
 * An address of exactly sizeof(short) bytes requests an automatic name
 * (unix_autobind()).  Otherwise the name is validated by unix_mkname():
 *   - a filesystem name gets a socket inode created with vfs_mknod() in
 *     its parent directory, and the sock is hashed by inode number;
 *   - an abstract name must not already be bound by a socket of the
 *     same type, and the sock is hashed by name hash xor type.
 *
 * Returns 0 on success; -EINVAL for a wrong family, an invalid name or
 * a socket that already has an address; -ENOMEM; -EADDRINUSE when the
 * name is taken (a VFS -EEXIST is mapped to it); -ENOENT for a path with
 * a trailing slash whose last component does not exist; or another
 * error from the VFS calls.
 */
static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	struct sockaddr_un *sunaddr=(struct sockaddr_un *)uaddr;
	struct dentry * dentry = NULL;
	struct nameidata nd;
	int err;
	unsigned hash;
	struct unix_address *addr;
	struct hlist_head *list;

	err = -EINVAL;
	if (sunaddr->sun_family != AF_UNIX)
		goto out;

	if (addr_len==sizeof(short)) {
		err = unix_autobind(sock);
		goto out;
	}

	/* On success the return value is the effective address length. */
	err = unix_mkname(sunaddr, addr_len, &hash);
	if (err < 0)
		goto out;
	addr_len = err;

	/* readsem serialises address assignment with unix_autobind(). */
	down(&u->readsem);

	err = -EINVAL;
	if (u->addr)
		goto out_up;

	err = -ENOMEM;
	addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
	if (!addr)
		goto out_up;

	memcpy(addr->name, sunaddr, addr_len);
	addr->len = addr_len;
	/*
	 * NOTE(review): unix_mkname() only writes hash for abstract names,
	 * so for a filesystem name this reads an unset value; addr->hash is
	 * overwritten with UNIX_HASH_SIZE further down in that case.
	 */
	addr->hash = hash ^ sk->sk_type;
	atomic_set(&addr->refcnt, 1);

	if (sunaddr->sun_path[0]) {
		unsigned int mode;
		err = 0;
		/*
		 * Get the parent directory, calculate the hash for last
		 * component.
		 */
		err = path_lookup(sunaddr->sun_path, LOOKUP_PARENT, &nd);
		if (err)
			goto out_mknod_parent;
		/*
		 * Yucky last component or no last component at all?
		 * (foo/., foo/.., /////)
		 */
		err = -EEXIST;
		if (nd.last_type != LAST_NORM)
			goto out_mknod;
		/*
		 * Lock the directory.
		 */
		down(&nd.dentry->d_inode->i_sem);
		/*
		 * Do the final lookup.
		 */
		dentry = lookup_hash(&nd.last, nd.dentry);
		err = PTR_ERR(dentry);
		if (IS_ERR(dentry))
			goto out_mknod_unlock;
		err = -ENOENT;
		/*
		 * Special case - lookup gave negative, but... we had foo/bar/
		 * From the vfs_mknod() POV we just have a negative dentry -
		 * all is fine. Let's be bastards - you had / on the end, you've
		 * been asking for (non-existent) directory. -ENOENT for you.
		 */
		if (nd.last.name[nd.last.len] && !dentry->d_inode)
			goto out_mknod_dput;
		/*
		 * All right, let's create it.
		 */
		mode = S_IFSOCK |
		       (SOCK_INODE(sock)->i_mode & ~current->fs->umask);
		err = vfs_mknod(nd.dentry->d_inode, dentry, mode, 0);
		if (err)
			goto out_mknod_dput;
		/*
		 * Created: unlock the directory, drop the parent dentry and
		 * keep the new dentry in nd so that nd.dentry/nd.mnt can be
		 * stored in the sock below.
		 */
		up(&nd.dentry->d_inode->i_sem);
		dput(nd.dentry);
		nd.dentry = dentry;

		/* Marks the address as a filesystem name (see UNIX_ABSTRACT). */
		addr->hash = UNIX_HASH_SIZE;
	}

	write_lock(&unix_table_lock);

	if (!sunaddr->sun_path[0]) {
		err = -EADDRINUSE;
		if (__unix_find_socket_byname(sunaddr, addr_len,
					      sk->sk_type, hash)) {
			unix_release_addr(addr);
			goto out_unlock;
		}

		list = &unix_socket_table[addr->hash];
	} else {
		/* Filesystem names are hashed by inode number (see unix_find_socket_byinode()). */
		list = &unix_socket_table[dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1)];
		u->dentry = nd.dentry;
		u->mnt    = nd.mnt;
	}

	/* Move the sock from its current list to the list for its new name. */
	err = 0;
	__unix_remove_socket(sk);
	u->addr = addr;
	__unix_insert_socket(list, sk);

out_unlock:
	write_unlock(&unix_table_lock);
out_up:
	up(&u->readsem);
out:
	return err;

	/* Unwind for the filesystem-name path, in reverse order of acquisition. */
out_mknod_dput:
	dput(dentry);
out_mknod_unlock:
	up(&nd.dentry->d_inode->i_sem);
out_mknod:
	path_release(&nd);
out_mknod_parent:
	if (err==-EEXIST)
		err=-EADDRINUSE;
	unix_release_addr(addr);
	goto out_up;
}
857
/*
 * connect() handler for datagram sockets.
 *
 * With an AF_UNSPEC address the current association is dissolved.
 * Otherwise the target is looked up and becomes the new peer, provided
 * unix_may_send() and the security hook allow it.  When the peer
 * changes, the receive queue is purged via unix_dgram_disconnected().
 *
 * Returns 0 on success; -EPERM when the target is connected to some
 * other socket; or an error from unix_mkname(), unix_autobind(),
 * unix_find_other() or the security hook.
 */
static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
			      int alen, int flags)
{
	struct sock *sk = sock->sk;
	struct sockaddr_un *sunaddr=(struct sockaddr_un*)addr;
	struct sock *other;
	unsigned hash;
	int err;

	if (addr->sa_family != AF_UNSPEC) {
		err = unix_mkname(sunaddr, alen, &hash);
		if (err < 0)
			goto out;
		alen = err;

		/* With SOCK_PASSCRED set, an unbound socket is autobound first. */
		if (test_bit(SOCK_PASSCRED, &sock->flags) &&
		    !unix_sk(sk)->addr && (err = unix_autobind(sock)) != 0)
			goto out;

		/* On success we hold a reference on other. */
		other=unix_find_other(sunaddr, alen, sock->type, hash, &err);
		if (!other)
			goto out;

		unix_state_wlock(sk);

		err = -EPERM;
		if (!unix_may_send(sk, other))
			goto out_unlock;

		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
		if (err)
			goto out_unlock;

	} else {
		/*
		 *	1003.1g breaking connected state with AF_UNSPEC
		 */
		other = NULL;
		unix_state_wlock(sk);
	}

	/*
	 * If it was connected, reconnect.
	 */
	if (unix_peer(sk)) {
		struct sock *old_peer = unix_peer(sk);
		unix_peer(sk)=other;
		unix_state_wunlock(sk);

		if (other != old_peer)
			unix_dgram_disconnected(sk, old_peer);
		/* Drop the reference that the old peer pointer held. */
		sock_put(old_peer);
	} else {
		/* The lookup reference (if any) now belongs to the peer pointer. */
		unix_peer(sk)=other;
		unix_state_wunlock(sk);
	}
 	return 0;

out_unlock:
	unix_state_wunlock(sk);
	sock_put(other);
out:
	return err;
}
922
923static long unix_wait_for_peer(struct sock *other, long timeo)
924{
925	struct unix_sock *u = unix_sk(other);
926	int sched;
927	DEFINE_WAIT(wait);
928
929	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
930
931	sched = !sock_flag(other, SOCK_DEAD) &&
932		!(other->sk_shutdown & RCV_SHUTDOWN) &&
933		(skb_queue_len(&other->sk_receive_queue) >
934		 other->sk_max_ack_backlog);
935
936	unix_state_runlock(other);
937
938	if (sched)
939		timeo = schedule_timeout(timeo);
940
941	finish_wait(&u->peer_wait, &wait);
942	return timeo;
943}
944
/*
 * connect() handler for stream and seqpacket sockets.
 *
 * A new sock (newsk) for the server side of the connection and a small
 * skb are allocated up front.  Once the listener has been found and
 * both socket states are locked, sk and newsk are made each other's
 * peers and the skb is appended to the listener's receive queue, from
 * which unix_accept() later takes it.
 *
 * Returns 0 on success.  Errors visible here: -ENOMEM (allocation),
 * -ECONNREFUSED (target not listening), -EAGAIN (listener queue full
 * and no timeout left), -EISCONN (sk already established), -EINVAL (sk
 * in any other state), the sock_intr_errno() value when a signal
 * interrupts the wait, or an error from unix_mkname(),
 * unix_autobind(), unix_find_other() or the security hook.
 */
static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
			       int addr_len, int flags)
{
	struct sockaddr_un *sunaddr=(struct sockaddr_un *)uaddr;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
	struct sock *newsk = NULL;
	struct sock *other = NULL;
	struct sk_buff *skb = NULL;
	unsigned hash;
	int st;
	int err;
	long timeo;

	err = unix_mkname(sunaddr, addr_len, &hash);
	if (err < 0)
		goto out;
	addr_len = err;

	/* With SOCK_PASSCRED set, an unbound socket is autobound first. */
	if (test_bit(SOCK_PASSCRED, &sock->flags)
		&& !u->addr && (err = unix_autobind(sock)) != 0)
		goto out;

	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

	/* First of all allocate resources.
	   If we will make it after state is locked,
	   we will have to recheck all again in any case.
	 */

	err = -ENOMEM;

	/* create new sock for complete connection */
	newsk = unix_create1(NULL);
	if (newsk == NULL)
		goto out;

	/* Allocate skb for sending to listening sock */
	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
	if (skb == NULL)
		goto out;

restart:
	/*  Find listening sock. */
	other = unix_find_other(sunaddr, addr_len, sk->sk_type, hash, &err);
	if (!other)
		goto out;

	/* Latch state of peer */
	unix_state_rlock(other);

	/* Apparently VFS overslept socket death. Retry. */
	if (sock_flag(other, SOCK_DEAD)) {
		unix_state_runlock(other);
		sock_put(other);
		goto restart;
	}

	err = -ECONNREFUSED;
	if (other->sk_state != TCP_LISTEN)
		goto out_unlock;

	if (skb_queue_len(&other->sk_receive_queue) >
	    other->sk_max_ack_backlog) {
		err = -EAGAIN;
		if (!timeo)
			goto out_unlock;

		/* unix_wait_for_peer() releases other's state read lock. */
		timeo = unix_wait_for_peer(other, timeo);

		/* The lock is already gone, so leave through "out", not "out_unlock". */
		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			goto out;
		sock_put(other);
		goto restart;
        }

	/* Latch our state.

	   It is tricky place. We need to grab write lock and cannot
	   drop lock on peer. It is dangerous because deadlock is
	   possible. Connect to self case and simultaneous
	   attempt to connect are eliminated by checking socket
	   state. other is TCP_LISTEN, if sk is TCP_LISTEN we
	   check this before attempt to grab lock.

	   Well, and we have to recheck the state after socket locked.
	 */
	st = sk->sk_state;

	switch (st) {
	case TCP_CLOSE:
		/* This is ok... continue with connect */
		break;
	case TCP_ESTABLISHED:
		/* Socket is already connected */
		err = -EISCONN;
		goto out_unlock;
	default:
		err = -EINVAL;
		goto out_unlock;
	}

	unix_state_wlock(sk);

	/* Our state changed while we were unlocked: drop everything and retry. */
	if (sk->sk_state != st) {
		unix_state_wunlock(sk);
		unix_state_runlock(other);
		sock_put(other);
		goto restart;
	}

	err = security_unix_stream_connect(sock, other->sk_socket, newsk);
	if (err) {
		unix_state_wunlock(sk);
		goto out_unlock;
	}

	/* The way is open! Fastly set all the necessary fields... */

	/* newsk's peer pointer holds a reference on sk. */
	sock_hold(sk);
	unix_peer(newsk)	= sk;
	newsk->sk_state		= TCP_ESTABLISHED;
	newsk->sk_type		= sk->sk_type;
	newsk->sk_peercred.pid	= current->tgid;
	newsk->sk_peercred.uid	= current->euid;
	newsk->sk_peercred.gid	= current->egid;
	newu = unix_sk(newsk);
	newsk->sk_sleep		= &newu->peer_wait;
	otheru = unix_sk(other);

	/* copy address information from listening to new sock*/
	if (otheru->addr) {
		atomic_inc(&otheru->addr->refcnt);
		newu->addr = otheru->addr;
	}
	if (otheru->dentry) {
		newu->dentry	= dget(otheru->dentry);
		newu->mnt	= mntget(otheru->mnt);
	}

	/* Set credentials (the listener's were recorded by unix_listen()) */
	sk->sk_peercred = other->sk_peercred;

	/* sk's peer pointer holds a reference on newsk. */
	sock_hold(newsk);
	unix_peer(sk)	= newsk;
	sock->state	= SS_CONNECTED;
	sk->sk_state	= TCP_ESTABLISHED;

	unix_state_wunlock(sk);

	/* take ten and send info to listening sock */
	spin_lock(&other->sk_receive_queue.lock);
	__skb_queue_tail(&other->sk_receive_queue, skb);
	/* Undo artificially decreased inflight after embrion
	 * is installed to listening socket. */
	atomic_inc(&newu->inflight);
	spin_unlock(&other->sk_receive_queue.lock);
	unix_state_runlock(other);
	other->sk_data_ready(other, 0);
	sock_put(other);
	return 0;

out_unlock:
	if (other)
		unix_state_runlock(other);

out:
	/* Release whatever was pre-allocated or looked up before the failure. */
	if (skb)
		kfree_skb(skb);
	if (newsk)
		unix_release_sock(newsk, 0);
	if (other)
		sock_put(other);
	return err;
}
1121
1122static int unix_socketpair(struct socket *socka, struct socket *sockb)
1123{
1124	struct sock *ska=socka->sk, *skb = sockb->sk;
1125
1126	/* Join our sockets back to back */
1127	sock_hold(ska);
1128	sock_hold(skb);
1129	unix_peer(ska)=skb;
1130	unix_peer(skb)=ska;
1131	ska->sk_peercred.pid = skb->sk_peercred.pid = current->tgid;
1132	ska->sk_peercred.uid = skb->sk_peercred.uid = current->euid;
1133	ska->sk_peercred.gid = skb->sk_peercred.gid = current->egid;
1134
1135	if (ska->sk_type != SOCK_DGRAM) {
1136		ska->sk_state = TCP_ESTABLISHED;
1137		skb->sk_state = TCP_ESTABLISHED;
1138		socka->state  = SS_CONNECTED;
1139		sockb->state  = SS_CONNECTED;
1140	}
1141	return 0;
1142}
1143
1144static int unix_accept(struct socket *sock, struct socket *newsock, int flags)
1145{
1146	struct sock *sk = sock->sk;
1147	struct sock *tsk;
1148	struct sk_buff *skb;
1149	int err;
1150
1151	err = -EOPNOTSUPP;
1152	if (sock->type!=SOCK_STREAM && sock->type!=SOCK_SEQPACKET)
1153		goto out;
1154
1155	err = -EINVAL;
1156	if (sk->sk_state != TCP_LISTEN)
1157		goto out;
1158
1159	/* If socket state is TCP_LISTEN it cannot change (for now...),
1160	 * so that no locks are necessary.
1161	 */
1162
1163	skb = skb_recv_datagram(sk, 0, flags&O_NONBLOCK, &err);
1164	if (!skb) {
1165		/* This means receive shutdown. */
1166		if (err == 0)
1167			err = -EINVAL;
1168		goto out;
1169	}
1170
1171	tsk = skb->sk;
1172	skb_free_datagram(sk, skb);
1173	wake_up_interruptible(&unix_sk(sk)->peer_wait);
1174
1175	/* attach accepted sock to socket */
1176	unix_state_wlock(tsk);
1177	newsock->state = SS_CONNECTED;
1178	sock_graft(tsk, newsock);
1179	unix_state_wunlock(tsk);
1180	return 0;
1181
1182out:
1183	return err;
1184}
1185
1186
1187static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_len, int peer)
1188{
1189	struct sock *sk = sock->sk;
1190	struct unix_sock *u;
1191	struct sockaddr_un *sunaddr=(struct sockaddr_un *)uaddr;
1192	int err = 0;
1193
1194	if (peer) {
1195		sk = unix_peer_get(sk);
1196
1197		err = -ENOTCONN;
1198		if (!sk)
1199			goto out;
1200		err = 0;
1201	} else {
1202		sock_hold(sk);
1203	}
1204
1205	u = unix_sk(sk);
1206	unix_state_rlock(sk);
1207	if (!u->addr) {
1208		sunaddr->sun_family = AF_UNIX;
1209		sunaddr->sun_path[0] = 0;
1210		*uaddr_len = sizeof(short);
1211	} else {
1212		struct unix_address *addr = u->addr;
1213
1214		*uaddr_len = addr->len;
1215		memcpy(sunaddr, addr->name, *uaddr_len);
1216	}
1217	unix_state_runlock(sk);
1218	sock_put(sk);
1219out:
1220	return err;
1221}
1222
1223static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1224{
1225	int i;
1226
1227	scm->fp = UNIXCB(skb).fp;
1228	skb->destructor = sock_wfree;
1229	UNIXCB(skb).fp = NULL;
1230
1231	for (i=scm->fp->count-1; i>=0; i--)
1232		unix_notinflight(scm->fp->fp[i]);
1233}
1234
1235static void unix_destruct_fds(struct sk_buff *skb)
1236{
1237	struct scm_cookie scm;
1238	memset(&scm, 0, sizeof(scm));
1239	unix_detach_fds(&scm, skb);
1240
1241	/* Alas, it calls VFS */
1242	/* So fscking what? fput() had been SMP-safe since the last Summer */
1243	scm_destroy(&scm);
1244	sock_wfree(skb);
1245}
1246
1247static void unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1248{
1249	int i;
1250	for (i=scm->fp->count-1; i>=0; i--)
1251		unix_inflight(scm->fp->fp[i]);
1252	UNIXCB(skb).fp = scm->fp;
1253	skb->destructor = unix_destruct_fds;
1254	scm->fp = NULL;
1255}
1256
1257/*
1258 *	Send AF_UNIX data.
1259 */
1260
/*
 *	Send one datagram on an AF_UNIX socket.
 *
 *	The destination is the address in msg->msg_name when
 *	msg->msg_namelen is non-zero, otherwise the connected peer.  The
 *	whole message is copied into a single skb, which is appended to the
 *	receiver's sk_receive_queue.
 *
 *	Returns len on success or a negative errno, among them:
 *	  -EOPNOTSUPP	MSG_OOB was requested
 *	  -ENOTCONN	no destination given and no peer
 *	  -EMSGSIZE	len exceeds sk_sndbuf - 32
 *	  -EPERM	unix_may_send() refused the pair
 *	  -ECONNREFUSED	the connected peer turned out to be dead
 *	  -ECONNRESET	the receiver died and there is no address to retry
 *	  -EPIPE	the receiver has RCV_SHUTDOWN set
 *	  -EAGAIN	receiver queue over its backlog and no time to wait
 */
static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock,
			      struct msghdr *msg, size_t len)
{
	struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	struct sockaddr_un *sunaddr=msg->msg_name;
	struct sock *other = NULL;
	int namelen = 0; /* fake GCC */
	int err;
	unsigned hash;
	struct sk_buff *skb;
	long timeo;
	struct scm_cookie tmp_scm;

	/* Fall back to on-stack scm storage if the iocb carries none. */
	if (NULL == siocb->scm)
		siocb->scm = &tmp_scm;
	err = scm_send(sock, msg, siocb->scm);
	if (err < 0)
		return err;

	err = -EOPNOTSUPP;
	if (msg->msg_flags&MSG_OOB)
		goto out;

	if (msg->msg_namelen) {
		/* Explicit destination.  A non-negative result is kept as the
		 * name length; hash is presumably filled in here too, as it is
		 * only used by the lookup further down (TODO confirm).
		 */
		err = unix_mkname(sunaddr, msg->msg_namelen, &hash);
		if (err < 0)
			goto out;
		namelen = err;
	} else {
		/* No destination: send to the connected peer.  sunaddr == NULL
		 * later tells the restart path that no lookup is possible.
		 */
		sunaddr = NULL;
		err = -ENOTCONN;
		other = unix_peer_get(sk);
		if (!other)
			goto out;
	}

	/* With SOCK_PASSCRED set an unbound sender is autobound first. */
	if (test_bit(SOCK_PASSCRED, &sock->flags)
		&& !u->addr && (err = unix_autobind(sock)) != 0)
		goto out;

	err = -EMSGSIZE;
	if (len > sk->sk_sndbuf - 32)
		goto out;

	skb = sock_alloc_send_skb(sk, len, msg->msg_flags&MSG_DONTWAIT, &err);
	if (skb==NULL)
		goto out;

	/* Stamp sender credentials and attach any passed files; attaching
	 * moves the file list from the cookie to the skb.
	 */
	memcpy(UNIXCREDS(skb), &siocb->scm->creds, sizeof(struct ucred));
	if (siocb->scm->fp)
		unix_attach_fds(siocb->scm, skb);

	skb->h.raw = skb->data;
	err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
	if (err)
		goto out_free;

	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);

restart:
	/* other is NULL on the first pass for addressed sends, and after a
	 * dead receiver has been dropped below.
	 */
	if (!other) {
		err = -ECONNRESET;
		if (sunaddr == NULL)
			goto out_free;

		other = unix_find_other(sunaddr, namelen, sk->sk_type,
					hash, &err);
		if (other==NULL)
			goto out_free;
	}

	unix_state_rlock(other);
	err = -EPERM;
	if (!unix_may_send(sk, other))
		goto out_unlock;

	if (sock_flag(other, SOCK_DEAD)) {
		/*
		 *	Check with 1003.1g - what should
		 *	datagram error
		 */
		/* Drop the lock and the reference held for this attempt. */
		unix_state_runlock(other);
		sock_put(other);

		err = 0;
		unix_state_wlock(sk);
		if (unix_peer(sk) == other) {
			/* The dead sock was our connected peer: disconnect,
			 * drop the peer reference and fail the send.
			 */
			unix_peer(sk)=NULL;
			unix_state_wunlock(sk);

			unix_dgram_disconnected(sk, other);
			sock_put(other);
			err = -ECONNREFUSED;
		} else {
			unix_state_wunlock(sk);
		}

		/* Otherwise retry: the address may resolve to a new sock. */
		other = NULL;
		if (err)
			goto out_free;
		goto restart;
	}

	err = -EPIPE;
	if (other->sk_shutdown & RCV_SHUTDOWN)
		goto out_unlock;

	if (sk->sk_type != SOCK_SEQPACKET) {
		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
		if (err)
			goto out_unlock;
	}

	/* Backlog limit: applies unless the receiver is connected back to
	 * us.
	 */
	if (unix_peer(other) != sk &&
	    (skb_queue_len(&other->sk_receive_queue) >
	     other->sk_max_ack_backlog)) {
		if (!timeo) {
			err = -EAGAIN;
			goto out_unlock;
		}

		/* NOTE(review): unix_wait_for_peer() presumably releases the
		 * read lock on other - neither exit below unlocks, and the
		 * restart path takes the lock again.  Confirm against its
		 * definition.
		 */
		timeo = unix_wait_for_peer(other, timeo);

		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			goto out_free;

		goto restart;
	}

	skb_queue_tail(&other->sk_receive_queue, skb);
	unix_state_runlock(other);
	other->sk_data_ready(other, len);
	sock_put(other);
	scm_destroy(siocb->scm);
	return len;

out_unlock:
	unix_state_runlock(other);
out_free:
	kfree_skb(skb);
out:
	if (other)
		sock_put(other);
	scm_destroy(siocb->scm);
	return err;
}
1410
1411
1412static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
1413			       struct msghdr *msg, size_t len)
1414{
1415	struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
1416	struct sock *sk = sock->sk;
1417	struct sock *other = NULL;
1418	struct sockaddr_un *sunaddr=msg->msg_name;
1419	int err,size;
1420	struct sk_buff *skb;
1421	int sent=0;
1422	struct scm_cookie tmp_scm;
1423
1424	if (NULL == siocb->scm)
1425		siocb->scm = &tmp_scm;
1426	err = scm_send(sock, msg, siocb->scm);
1427	if (err < 0)
1428		return err;
1429
1430	err = -EOPNOTSUPP;
1431	if (msg->msg_flags&MSG_OOB)
1432		goto out_err;
1433
1434	if (msg->msg_namelen) {
1435		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
1436		goto out_err;
1437	} else {
1438		sunaddr = NULL;
1439		err = -ENOTCONN;
1440		other = unix_peer_get(sk);
1441		if (!other)
1442			goto out_err;
1443	}
1444
1445	if (sk->sk_shutdown & SEND_SHUTDOWN)
1446		goto pipe_err;
1447
1448	while(sent < len)
1449	{
1450		/*
1451		 *	Optimisation for the fact that under 0.01% of X messages typically
1452		 *	need breaking up.
1453		 */
1454
1455		size=len-sent;
1456
1457		/* Keep two messages in the pipe so it schedules better */
1458		if (size > sk->sk_sndbuf / 2 - 64)
1459			size = sk->sk_sndbuf / 2 - 64;
1460
1461		if (size > SKB_MAX_ALLOC)
1462			size = SKB_MAX_ALLOC;
1463
1464		/*
1465		 *	Grab a buffer
1466		 */
1467
1468		skb=sock_alloc_send_skb(sk,size,msg->msg_flags&MSG_DONTWAIT, &err);
1469
1470		if (skb==NULL)
1471			goto out_err;
1472
1473		/*
1474		 *	If you pass two values to the sock_alloc_send_skb
1475		 *	it tries to grab the large buffer with GFP_NOFS
1476		 *	(which can fail easily), and if it fails grab the
1477		 *	fallback size buffer which is under a page and will
1478		 *	succeed. [Alan]
1479		 */
1480		size = min_t(int, size, skb_tailroom(skb));
1481
1482		memcpy(UNIXCREDS(skb), &siocb->scm->creds, sizeof(struct ucred));
1483		if (siocb->scm->fp)
1484			unix_attach_fds(siocb->scm, skb);
1485
1486		if ((err = memcpy_fromiovec(skb_put(skb,size), msg->msg_iov, size)) != 0) {
1487			kfree_skb(skb);
1488			goto out_err;
1489		}
1490
1491		unix_state_rlock(other);
1492
1493		if (sock_flag(other, SOCK_DEAD) ||
1494		    (other->sk_shutdown & RCV_SHUTDOWN))
1495			goto pipe_err_free;
1496
1497		skb_queue_tail(&other->sk_receive_queue, skb);
1498		unix_state_runlock(other);
1499		other->sk_data_ready(other, size);
1500		sent+=size;
1501	}
1502	sock_put(other);
1503
1504	scm_destroy(siocb->scm);
1505	siocb->scm = NULL;
1506
1507	return sent;
1508
1509pipe_err_free:
1510	unix_state_runlock(other);
1511	kfree_skb(skb);
1512pipe_err:
1513	if (sent==0 && !(msg->msg_flags&MSG_NOSIGNAL))
1514		send_sig(SIGPIPE,current,0);
1515	err = -EPIPE;
1516out_err:
1517        if (other)
1518		sock_put(other);
1519	scm_destroy(siocb->scm);
1520	siocb->scm = NULL;
1521	return sent ? : err;
1522}
1523
1524static int unix_seqpacket_sendmsg(struct kiocb *kiocb, struct socket *sock,
1525				  struct msghdr *msg, size_t len)
1526{
1527	int err;
1528	struct sock *sk = sock->sk;
1529
1530	err = sock_error(sk);
1531	if (err)
1532		return err;
1533
1534	if (sk->sk_state != TCP_ESTABLISHED)
1535		return -ENOTCONN;
1536
1537	if (msg->msg_namelen)
1538		msg->msg_namelen = 0;
1539
1540	return unix_dgram_sendmsg(kiocb, sock, msg, len);
1541}
1542
1543static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
1544{
1545	struct unix_sock *u = unix_sk(sk);
1546
1547	msg->msg_namelen = 0;
1548	if (u->addr) {
1549		msg->msg_namelen = u->addr->len;
1550		memcpy(msg->msg_name, u->addr->name, u->addr->len);
1551	}
1552}
1553
1554static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
1555			      struct msghdr *msg, size_t size,
1556			      int flags)
1557{
1558	struct sock_iocb *siocb = kiocb_to_siocb(iocb);
1559	struct scm_cookie tmp_scm;
1560	struct sock *sk = sock->sk;
1561	struct unix_sock *u = unix_sk(sk);
1562	int noblock = flags & MSG_DONTWAIT;
1563	struct sk_buff *skb;
1564	int err;
1565
1566	err = -EOPNOTSUPP;
1567	if (flags&MSG_OOB)
1568		goto out;
1569
1570	msg->msg_namelen = 0;
1571
1572	down(&u->readsem);
1573
1574	skb = skb_recv_datagram(sk, flags, noblock, &err);
1575	if (!skb)
1576		goto out_unlock;
1577
1578	wake_up_interruptible(&u->peer_wait);
1579
1580	if (msg->msg_name)
1581		unix_copy_addr(msg, skb->sk);
1582
1583	if (size > skb->len)
1584		size = skb->len;
1585	else if (size < skb->len)
1586		msg->msg_flags |= MSG_TRUNC;
1587
1588	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, size);
1589	if (err)
1590		goto out_free;
1591
1592	if (!siocb->scm) {
1593		siocb->scm = &tmp_scm;
1594		memset(&tmp_scm, 0, sizeof(tmp_scm));
1595	}
1596	siocb->scm->creds = *UNIXCREDS(skb);
1597
1598	if (!(flags & MSG_PEEK))
1599	{
1600		if (UNIXCB(skb).fp)
1601			unix_detach_fds(siocb->scm, skb);
1602	}
1603	else
1604	{
1605		/* It is questionable: on PEEK we could:
1606		   - do not return fds - good, but too simple 8)
1607		   - return fds, and do not return them on read (old strategy,
1608		     apparently wrong)
1609		   - clone fds (I chose it for now, it is the most universal
1610		     solution)
1611
1612	           POSIX 1003.1g does not actually define this clearly
1613	           at all. POSIX 1003.1g doesn't define a lot of things
1614	           clearly however!
1615
1616		*/
1617		if (UNIXCB(skb).fp)
1618			siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1619	}
1620	err = size;
1621
1622	scm_recv(sock, msg, siocb->scm, flags);
1623
1624out_free:
1625	skb_free_datagram(sk,skb);
1626out_unlock:
1627	up(&u->readsem);
1628out:
1629	return err;
1630}
1631
/*
 *	Sleep until data has arrived. But check for races..
 */
1635
1636static long unix_stream_data_wait(struct sock * sk, long timeo)
1637{
1638	DEFINE_WAIT(wait);
1639
1640	unix_state_rlock(sk);
1641
1642	for (;;) {
1643		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1644
1645		if (skb_queue_len(&sk->sk_receive_queue) ||
1646		    sk->sk_err ||
1647		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
1648		    signal_pending(current) ||
1649		    !timeo)
1650			break;
1651
1652		set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1653		unix_state_runlock(sk);
1654		timeo = schedule_timeout(timeo);
1655		unix_state_rlock(sk);
1656		clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1657	}
1658
1659	finish_wait(sk->sk_sleep, &wait);
1660	unix_state_runlock(sk);
1661	return timeo;
1662}
1663
1664
1665
/*
 *	Receive data from a connected AF_UNIX stream socket.
 *
 *	Dequeues skbs from sk->sk_receive_queue and copies their payload to
 *	msg->msg_iov until size bytes have been copied or one of the stop
 *	conditions in the loop is met.  The credentials of the first skb are
 *	recorded in the scm cookie; an skb carrying different credentials is
 *	pushed back and ends the read.
 *
 *	Returns the number of bytes copied when that is non-zero, otherwise
 *	err: -EINVAL if the socket is not TCP_ESTABLISHED, -EOPNOTSUPP for
 *	MSG_OOB, a pending socket error, 0 after receive shutdown, -EAGAIN
 *	when the queue is empty and the timeout is zero, or the
 *	sock_intr_errno() value when a signal interrupts the wait.  A failed
 *	copy with nothing copied so far yields -EFAULT.
 */
static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
			       struct msghdr *msg, size_t size,
			       int flags)
{
	struct sock_iocb *siocb = kiocb_to_siocb(iocb);
	struct scm_cookie tmp_scm;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	struct sockaddr_un *sunaddr=msg->msg_name;
	int copied = 0;
	int check_creds = 0;
	int target;
	int err = 0;
	long timeo;

	err = -EINVAL;
	if (sk->sk_state != TCP_ESTABLISHED)
		goto out;

	err = -EOPNOTSUPP;
	if (flags&MSG_OOB)
		goto out;

	/* target: bytes that must be copied before an empty queue ends the
	 * read instead of putting us to sleep.
	 */
	target = sock_rcvlowat(sk, flags&MSG_WAITALL, size);
	timeo = sock_rcvtimeo(sk, flags&MSG_DONTWAIT);

	msg->msg_namelen = 0;

	/* Use on-stack, zeroed scm storage if the iocb carries none. */
	if (!siocb->scm) {
		siocb->scm = &tmp_scm;
		memset(&tmp_scm, 0, sizeof(tmp_scm));
	}

	/* Lock the socket to prevent queue disordering
	 * while we sleep copying data out to the caller.
	 */
	down(&u->readsem);

	do
	{
		int chunk;
		struct sk_buff *skb;

		skb = skb_dequeue(&sk->sk_receive_queue);
		if (skb==NULL)
		{
			if (copied >= target)
				break;

			/*
			 *	POSIX 1003.1g mandates this order.
			 */

			if ((err = sock_error(sk)) != 0)
				break;
			if (sk->sk_shutdown & RCV_SHUTDOWN)
				break;
			err = -EAGAIN;
			if (!timeo)
				break;
			/* Do not hold readsem while sleeping. */
			up(&u->readsem);

			timeo = unix_stream_data_wait(sk, timeo);

			if (signal_pending(current)) {
				err = sock_intr_errno(timeo);
				/* readsem is not held here, so "out" (past
				 * the up() below) is the right target.
				 */
				goto out;
			}
			down(&u->readsem);
			/* In a do/while, continue re-tests "size" before the
			 * next pass.
			 */
			continue;
		}

		if (check_creds) {
			/* Never glue messages from different writers */
			if (memcmp(UNIXCREDS(skb), &siocb->scm->creds, sizeof(siocb->scm->creds)) != 0) {
				skb_queue_head(&sk->sk_receive_queue, skb);
				break;
			}
		} else {
			/* Copy credentials */
			siocb->scm->creds = *UNIXCREDS(skb);
			check_creds = 1;
		}

		/* Copy address just once */
		if (sunaddr)
		{
			unix_copy_addr(msg, skb->sk);
			sunaddr = NULL;
		}

		chunk = min_t(unsigned int, skb->len, size);
		if (memcpy_toiovec(msg->msg_iov, skb->data, chunk)) {
			/* Copy failed: requeue the skb untouched.  Data
			 * already copied from earlier skbs still counts.
			 */
			skb_queue_head(&sk->sk_receive_queue, skb);
			if (copied == 0)
				copied = -EFAULT;
			break;
		}
		copied += chunk;
		size -= chunk;

		/* Mark read part of skb as used */
		if (!(flags & MSG_PEEK))
		{
			skb_pull(skb, chunk);

			if (UNIXCB(skb).fp)
				unix_detach_fds(siocb->scm, skb);

			/* put the skb back if we didn't use it up.. */
			if (skb->len)
			{
				skb_queue_head(&sk->sk_receive_queue, skb);
				break;
			}

			kfree_skb(skb);

			/* Stop after an skb that carried passed files. */
			if (siocb->scm->fp)
				break;
		}
		else
		{
			/* It is questionable, see note in unix_dgram_recvmsg.
			 */
			if (UNIXCB(skb).fp)
				siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);

			/* put message back and return */
			skb_queue_head(&sk->sk_receive_queue, skb);
			break;
		}
	} while (size);

	up(&u->readsem);
	scm_recv(sock, msg, siocb->scm, flags);
out:
	return copied ? : err;
}
1806
1807static int unix_shutdown(struct socket *sock, int mode)
1808{
1809	struct sock *sk = sock->sk;
1810	struct sock *other;
1811
1812	mode = (mode+1)&(RCV_SHUTDOWN|SEND_SHUTDOWN);
1813
1814	if (mode) {
1815		unix_state_wlock(sk);
1816		sk->sk_shutdown |= mode;
1817		other=unix_peer(sk);
1818		if (other)
1819			sock_hold(other);
1820		unix_state_wunlock(sk);
1821		sk->sk_state_change(sk);
1822
1823		if (other &&
1824			(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
1825
1826			int peer_mode = 0;
1827
1828			if (mode&RCV_SHUTDOWN)
1829				peer_mode |= SEND_SHUTDOWN;
1830			if (mode&SEND_SHUTDOWN)
1831				peer_mode |= RCV_SHUTDOWN;
1832			unix_state_wlock(other);
1833			other->sk_shutdown |= peer_mode;
1834			unix_state_wunlock(other);
1835			other->sk_state_change(other);
1836			read_lock(&other->sk_callback_lock);
1837			if (peer_mode == SHUTDOWN_MASK)
1838				sk_wake_async(other,1,POLL_HUP);
1839			else if (peer_mode & RCV_SHUTDOWN)
1840				sk_wake_async(other,1,POLL_IN);
1841			read_unlock(&other->sk_callback_lock);
1842		}
1843		if (other)
1844			sock_put(other);
1845	}
1846	return 0;
1847}
1848
1849static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1850{
1851	struct sock *sk = sock->sk;
1852	long amount=0;
1853	int err;
1854
1855	switch(cmd)
1856	{
1857		case SIOCOUTQ:
1858			amount = atomic_read(&sk->sk_wmem_alloc);
1859			err = put_user(amount, (int __user *)arg);
1860			break;
1861		case SIOCINQ:
1862		{
1863			struct sk_buff *skb;
1864
1865			if (sk->sk_state == TCP_LISTEN) {
1866				err = -EINVAL;
1867				break;
1868			}
1869
1870			spin_lock(&sk->sk_receive_queue.lock);
1871			if (sk->sk_type == SOCK_STREAM ||
1872			    sk->sk_type == SOCK_SEQPACKET) {
1873				skb_queue_walk(&sk->sk_receive_queue, skb)
1874					amount += skb->len;
1875			} else {
1876				skb = skb_peek(&sk->sk_receive_queue);
1877				if (skb)
1878					amount=skb->len;
1879			}
1880			spin_unlock(&sk->sk_receive_queue.lock);
1881			err = put_user(amount, (int __user *)arg);
1882			break;
1883		}
1884
1885		default:
1886			err = dev_ioctl(cmd, (void __user *)arg);
1887			break;
1888	}
1889	return err;
1890}
1891
1892static unsigned int unix_poll(struct file * file, struct socket *sock, poll_table *wait)
1893{
1894	struct sock *sk = sock->sk;
1895	unsigned int mask;
1896
1897	poll_wait(file, sk->sk_sleep, wait);
1898	mask = 0;
1899
1900	/* exceptional events? */
1901	if (sk->sk_err)
1902		mask |= POLLERR;
1903	if (sk->sk_shutdown == SHUTDOWN_MASK)
1904		mask |= POLLHUP;
1905
1906	/* readable? */
1907	if (!skb_queue_empty(&sk->sk_receive_queue) ||
1908	    (sk->sk_shutdown & RCV_SHUTDOWN))
1909		mask |= POLLIN | POLLRDNORM;
1910
1911	/* Connection-based need to check for termination and startup */
1912	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) && sk->sk_state == TCP_CLOSE)
1913		mask |= POLLHUP;
1914
1915	/*
1916	 * we set writable also when the other side has shut down the
1917	 * connection. This prevents stuck sockets.
1918	 */
1919	if (unix_writable(sk))
1920		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
1921
1922	return mask;
1923}
1924
1925
1926#ifdef CONFIG_PROC_FS
1927static struct sock *unix_seq_idx(int *iter, loff_t pos)
1928{
1929	loff_t off = 0;
1930	struct sock *s;
1931
1932	for (s = first_unix_socket(iter); s; s = next_unix_socket(iter, s)) {
1933		if (off == pos)
1934			return s;
1935		++off;
1936	}
1937	return NULL;
1938}
1939
1940
1941static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
1942{
1943	read_lock(&unix_table_lock);
1944	return *pos ? unix_seq_idx(seq->private, *pos - 1) : ((void *) 1);
1945}
1946
1947static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1948{
1949	++*pos;
1950
1951	if (v == (void *)1)
1952		return first_unix_socket(seq->private);
1953	return next_unix_socket(seq->private, v);
1954}
1955
/* seq_file stop hook: drops the read lock taken in unix_seq_start(). */
static void unix_seq_stop(struct seq_file *seq, void *v)
{
	read_unlock(&unix_table_lock);
}
1960
1961static int unix_seq_show(struct seq_file *seq, void *v)
1962{
1963
1964	if (v == (void *)1)
1965		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
1966			 "Inode Path\n");
1967	else {
1968		struct sock *s = v;
1969		struct unix_sock *u = unix_sk(s);
1970		unix_state_rlock(s);
1971
1972		seq_printf(seq, "%p: %08X %08X %08X %04X %02X %5lu",
1973			s,
1974			atomic_read(&s->sk_refcnt),
1975			0,
1976			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
1977			s->sk_type,
1978			s->sk_socket ?
1979			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
1980			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
1981			sock_i_ino(s));
1982
1983		if (u->addr) {
1984			int i, len;
1985			seq_putc(seq, ' ');
1986
1987			i = 0;
1988			len = u->addr->len - sizeof(short);
1989			if (!UNIX_ABSTRACT(s))
1990				len--;
1991			else {
1992				seq_putc(seq, '@');
1993				i++;
1994			}
1995			for ( ; i < len; i++)
1996				seq_putc(seq, u->addr->name->sun_path[i]);
1997		}
1998		unix_state_runlock(s);
1999		seq_putc(seq, '\n');
2000	}
2001
2002	return 0;
2003}
2004
/* Iterator callbacks handed to seq_open() by unix_seq_open(). */
static struct seq_operations unix_seq_ops = {
	.start  = unix_seq_start,
	.next   = unix_seq_next,
	.stop   = unix_seq_stop,
	.show   = unix_seq_show,
};
2011
2012
2013static int unix_seq_open(struct inode *inode, struct file *file)
2014{
2015	struct seq_file *seq;
2016	int rc = -ENOMEM;
2017	int *iter = kmalloc(sizeof(int), GFP_KERNEL);
2018
2019	if (!iter)
2020		goto out;
2021
2022	rc = seq_open(file, &unix_seq_ops);
2023	if (rc)
2024		goto out_kfree;
2025
2026	seq	     = file->private_data;
2027	seq->private = iter;
2028	*iter = 0;
2029out:
2030	return rc;
2031out_kfree:
2032	kfree(iter);
2033	goto out;
2034}
2035
/*
 * File operations for the "unix" proc net entry created in af_unix_init().
 * open goes through unix_seq_open(); release is seq_release_private,
 * presumably so the seq->private allocation made at open time is freed
 * (TODO confirm against seq_file).
 */
static struct file_operations unix_seq_fops = {
	.owner		= THIS_MODULE,
	.open		= unix_seq_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_private,
};
2043
2044#endif
2045
/*
 * Protocol family descriptor for PF_UNIX, passed to sock_register() in
 * af_unix_init(); unix_create is the socket creation hook.
 */
static struct net_proto_family unix_family_ops = {
	.family = PF_UNIX,
	.create = unix_create,
	.owner	= THIS_MODULE,
};
2051
/*
 * sysctl hooks.  With CONFIG_SYSCTL the implementations are defined outside
 * this file; without it, empty inline stubs let init/exit call them with no
 * #ifdef at the call sites.
 */
#ifdef CONFIG_SYSCTL
extern void unix_sysctl_register(void);
extern void unix_sysctl_unregister(void);
#else
static inline void unix_sysctl_register(void) {}
static inline void unix_sysctl_unregister(void) {}
#endif
2059
2060static int __init af_unix_init(void)
2061{
2062	int rc = -1;
2063	struct sk_buff *dummy_skb;
2064
2065	if (sizeof(struct unix_skb_parms) > sizeof(dummy_skb->cb)) {
2066		printk(KERN_CRIT "%s: panic\n", __FUNCTION__);
2067		goto out;
2068	}
2069
2070	rc = proto_register(&unix_proto, 1);
2071        if (rc != 0) {
2072                printk(KERN_CRIT "%s: Cannot create unix_sock SLAB cache!\n",
2073		       __FUNCTION__);
2074		goto out;
2075	}
2076
2077	sock_register(&unix_family_ops);
2078#ifdef CONFIG_PROC_FS
2079	proc_net_fops_create("unix", 0, &unix_seq_fops);
2080#endif
2081	unix_sysctl_register();
2082out:
2083	return rc;
2084}
2085
/*
 * Module unload: unregisters the PF_UNIX family, the sysctl hooks, the
 * "unix" proc net entry and finally unix_proto.
 */
static void __exit af_unix_exit(void)
{
	sock_unregister(PF_UNIX);
	unix_sysctl_unregister();
	proc_net_remove("unix");
	proto_unregister(&unix_proto);
}
2093
/* Module entry and exit points. */
module_init(af_unix_init);
module_exit(af_unix_exit);

/* Licence tag and the PF_UNIX protocol-family module alias. */
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_UNIX);
2099