af_unix.c revision 1b1dcc1b57a49136f118a0f16367256ff9994a69
1/*
2 * NET4:	Implementation of BSD Unix domain sockets.
3 *
4 * Authors:	Alan Cox, <alan.cox@linux.org>
5 *
6 *		This program is free software; you can redistribute it and/or
7 *		modify it under the terms of the GNU General Public License
8 *		as published by the Free Software Foundation; either version
9 *		2 of the License, or (at your option) any later version.
10 *
11 * Version:	$Id: af_unix.c,v 1.133 2002/02/08 03:57:19 davem Exp $
12 *
13 * Fixes:
14 *		Linus Torvalds	:	Assorted bug cures.
15 *		Niibe Yutaka	:	async I/O support.
16 *		Carsten Paeth	:	PF_UNIX check, address fixes.
17 *		Alan Cox	:	Limit size of allocated blocks.
18 *		Alan Cox	:	Fixed the stupid socketpair bug.
19 *		Alan Cox	:	BSD compatibility fine tuning.
20 *		Alan Cox	:	Fixed a bug in connect when interrupted.
21 *		Alan Cox	:	Sorted out a proper draft version of
22 *					file descriptor passing hacked up from
23 *					Mike Shaver's work.
24 *		Marty Leisner	:	Fixes to fd passing
25 *		Nick Nevin	:	recvmsg bugfix.
26 *		Alan Cox	:	Started proper garbage collector
27 *		Heiko EiBfeldt	:	Missing verify_area check
28 *		Alan Cox	:	Started POSIXisms
29 *		Andreas Schwab	:	Replace inode by dentry for proper
30 *					reference counting
31 *		Kirk Petersen	:	Made this a module
32 *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
33 *					Lots of bug fixes.
 *	     Alexey Kuznetsov	:	Repaired (I hope) bugs introduced
 *					by above two patches.
 *	     Andrea Arcangeli	:	If possible we block in connect(2)
 *					if the max backlog of the listen socket
 *					has been reached. This won't break
 *					old apps and it will avoid a huge number
 *					of socks hashed (this for unix_gc()
 *					performance reasons).
42 *					Security fix that limits the max
43 *					number of socks to 2*max_files and
44 *					the number of skb queueable in the
45 *					dgram receiver.
46 *		Artur Skawina   :	Hash function optimizations
47 *	     Alexey Kuznetsov   :	Full scale SMP. Lot of bugs are introduced 8)
48 *	      Malcolm Beattie   :	Set peercred for socketpair
49 *	     Michal Ostrowski   :       Module initialization cleanup.
50 *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
51 *	     				the core infrastructure is doing that
52 *	     				for all net proto families now (2.5.69+)
53 *
54 *
55 * Known differences from reference BSD that was tested:
56 *
57 *	[TO FIX]
58 *	ECONNREFUSED is not returned from one end of a connected() socket to the
59 *		other the moment one end closes.
60 *	fstat() doesn't return st_dev=0, and give the blksize as high water mark
61 *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
62 *	[NOT TO FIX]
63 *	accept() returns a path name even if the connecting socket has closed
64 *		in the meantime (BSD loses the path and gives up).
65 *	accept() returns 0 length path for an unbound connector. BSD returns 16
66 *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
67 *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
68 *	BSD af_unix apparently has connect forgetting to block properly.
69 *		(need to check this with the POSIX spec in detail)
70 *
71 * Differences from 2.0.0-11-... (ANK)
72 *	Bug fixes and improvements.
73 *		- client shutdown killed server socket.
74 *		- removed all useless cli/sti pairs.
75 *
76 *	Semantic changes/extensions.
77 *		- generic control message passing.
78 *		- SCM_CREDENTIALS control message.
79 *		- "Abstract" (not FS based) socket bindings.
80 *		  Abstract names are sequences of bytes (not zero terminated)
81 *		  started by 0, so that this name space does not intersect
82 *		  with BSD names.
83 */
84
85#include <linux/module.h>
86#include <linux/config.h>
87#include <linux/kernel.h>
88#include <linux/signal.h>
89#include <linux/sched.h>
90#include <linux/errno.h>
91#include <linux/string.h>
92#include <linux/stat.h>
93#include <linux/dcache.h>
94#include <linux/namei.h>
95#include <linux/socket.h>
96#include <linux/un.h>
97#include <linux/fcntl.h>
98#include <linux/termios.h>
99#include <linux/sockios.h>
100#include <linux/net.h>
101#include <linux/in.h>
102#include <linux/fs.h>
103#include <linux/slab.h>
104#include <asm/uaccess.h>
105#include <linux/skbuff.h>
106#include <linux/netdevice.h>
107#include <net/sock.h>
108#include <net/tcp_states.h>
109#include <net/af_unix.h>
110#include <linux/proc_fs.h>
111#include <linux/seq_file.h>
112#include <net/scm.h>
113#include <linux/init.h>
114#include <linux/poll.h>
115#include <linux/smp_lock.h>
116#include <linux/rtnetlink.h>
117#include <linux/mount.h>
118#include <net/checksum.h>
119#include <linux/security.h>
120
/*
 * Initial value copied into sk_max_ack_backlog of every socket created by
 * unix_create1().  NOTE(review): the name suggests it is exported as a
 * sysctl tunable elsewhere - confirm.
 */
int sysctl_unix_max_dgram_qlen = 10;

/*
 * Hash table of all unix sockets.  It has UNIX_HASH_SIZE + 1 chains; the
 * extra, last chain is used for sockets that are not bound yet (see
 * unix_sockets_unbound below).
 */
struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1];
/* Serialises every walk and modification of unix_socket_table. */
DEFINE_SPINLOCK(unix_table_lock);
/* Count of allocated unix socks; checked against a limit in unix_create1(). */
static atomic_t unix_nr_socks = ATOMIC_INIT(0);

/* Chain holding sockets that have no address yet. */
#define unix_sockets_unbound	(&unix_socket_table[UNIX_HASH_SIZE])

/*
 * True if the socket's address hash is not UNIX_HASH_SIZE, the marker value
 * unix_bind() stores for filesystem names.  Dereferences ->addr, so the
 * socket must be bound.
 */
#define UNIX_ABSTRACT(sk)	(unix_sk(sk)->addr->hash != UNIX_HASH_SIZE)
130
131/*
132 *  SMP locking strategy:
133 *    hash table is protected with spinlock unix_table_lock
134 *    each socket state is protected by separate rwlock.
135 */
136
137static inline unsigned unix_hash_fold(unsigned hash)
138{
139	hash ^= hash>>16;
140	hash ^= hash>>8;
141	return hash&(UNIX_HASH_SIZE-1);
142}
143
/* Peer pointer of a unix socket; expands to an lvalue so it can be assigned. */
#define unix_peer(sk) (unix_sk(sk)->peer)
145
/* Returns non-zero when osk's peer pointer refers to sk. */
static inline int unix_our_peer(struct sock *sk, struct sock *osk)
{
	struct sock *peer_of_osk = unix_peer(osk);

	return peer_of_osk == sk;
}
150
151static inline int unix_may_send(struct sock *sk, struct sock *osk)
152{
153	return (unix_peer(osk) == NULL || unix_our_peer(sk, osk));
154}
155
156static struct sock *unix_peer_get(struct sock *s)
157{
158	struct sock *peer;
159
160	unix_state_rlock(s);
161	peer = unix_peer(s);
162	if (peer)
163		sock_hold(peer);
164	unix_state_runlock(s);
165	return peer;
166}
167
168static inline void unix_release_addr(struct unix_address *addr)
169{
170	if (atomic_dec_and_test(&addr->refcnt))
171		kfree(addr);
172}
173
174/*
175 *	Check unix socket name:
176 *		- should be not zero length.
177 *	        - if started by not zero, should be NULL terminated (FS object)
178 *		- if started by zero, it is abstract name.
179 */
180
181static int unix_mkname(struct sockaddr_un * sunaddr, int len, unsigned *hashp)
182{
183	if (len <= sizeof(short) || len > sizeof(*sunaddr))
184		return -EINVAL;
185	if (!sunaddr || sunaddr->sun_family != AF_UNIX)
186		return -EINVAL;
187	if (sunaddr->sun_path[0]) {
188		/*
189		 * This may look like an off by one error but it is a bit more
190		 * subtle. 108 is the longest valid AF_UNIX path for a binding.
191		 * sun_path[108] doesnt as such exist.  However in kernel space
192		 * we are guaranteed that it is a valid memory location in our
193		 * kernel address buffer.
194		 */
195		((char *)sunaddr)[len]=0;
196		len = strlen(sunaddr->sun_path)+1+sizeof(short);
197		return len;
198	}
199
200	*hashp = unix_hash_fold(csum_partial((char*)sunaddr, len, 0));
201	return len;
202}
203
/*
 * Unlink sk from whatever hash chain it is on.  Takes no lock itself;
 * callers in this file hold unix_table_lock around it.
 */
static void __unix_remove_socket(struct sock *sk)
{
	sk_del_node_init(sk);
}
208
/*
 * Add sk to the given hash chain.  Takes no lock itself; callers in this
 * file hold unix_table_lock around it.
 */
static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
{
	/* The socket must have been unhashed first. */
	BUG_TRAP(sk_unhashed(sk));
	sk_add_node(sk, list);
}
214
215static inline void unix_remove_socket(struct sock *sk)
216{
217	spin_lock(&unix_table_lock);
218	__unix_remove_socket(sk);
219	spin_unlock(&unix_table_lock);
220}
221
222static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
223{
224	spin_lock(&unix_table_lock);
225	__unix_insert_socket(list, sk);
226	spin_unlock(&unix_table_lock);
227}
228
229static struct sock *__unix_find_socket_byname(struct sockaddr_un *sunname,
230					      int len, int type, unsigned hash)
231{
232	struct sock *s;
233	struct hlist_node *node;
234
235	sk_for_each(s, node, &unix_socket_table[hash ^ type]) {
236		struct unix_sock *u = unix_sk(s);
237
238		if (u->addr->len == len &&
239		    !memcmp(u->addr->name, sunname, len))
240			goto found;
241	}
242	s = NULL;
243found:
244	return s;
245}
246
247static inline struct sock *unix_find_socket_byname(struct sockaddr_un *sunname,
248						   int len, int type,
249						   unsigned hash)
250{
251	struct sock *s;
252
253	spin_lock(&unix_table_lock);
254	s = __unix_find_socket_byname(sunname, len, type, hash);
255	if (s)
256		sock_hold(s);
257	spin_unlock(&unix_table_lock);
258	return s;
259}
260
261static struct sock *unix_find_socket_byinode(struct inode *i)
262{
263	struct sock *s;
264	struct hlist_node *node;
265
266	spin_lock(&unix_table_lock);
267	sk_for_each(s, node,
268		    &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
269		struct dentry *dentry = unix_sk(s)->dentry;
270
271		if(dentry && dentry->d_inode == i)
272		{
273			sock_hold(s);
274			goto found;
275		}
276	}
277	s = NULL;
278found:
279	spin_unlock(&unix_table_lock);
280	return s;
281}
282
283static inline int unix_writable(struct sock *sk)
284{
285	return (atomic_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
286}
287
288static void unix_write_space(struct sock *sk)
289{
290	read_lock(&sk->sk_callback_lock);
291	if (unix_writable(sk)) {
292		if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
293			wake_up_interruptible(sk->sk_sleep);
294		sk_wake_async(sk, 2, POLL_OUT);
295	}
296	read_unlock(&sk->sk_callback_lock);
297}
298
299/* When dgram socket disconnects (or changes its peer), we clear its receive
300 * queue of packets arrived from previous peer. First, it allows to do
301 * flow control based only on wmem_alloc; second, sk connected to peer
302 * may receive messages only from that peer. */
303static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
304{
305	if (!skb_queue_empty(&sk->sk_receive_queue)) {
306		skb_queue_purge(&sk->sk_receive_queue);
307		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
308
309		/* If one link of bidirectional dgram pipe is disconnected,
310		 * we signal error. Messages are lost. Do not make this,
311		 * when peer was not connected to us.
312		 */
313		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
314			other->sk_err = ECONNRESET;
315			other->sk_error_report(other);
316		}
317	}
318}
319
320static void unix_sock_destructor(struct sock *sk)
321{
322	struct unix_sock *u = unix_sk(sk);
323
324	skb_queue_purge(&sk->sk_receive_queue);
325
326	BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc));
327	BUG_TRAP(sk_unhashed(sk));
328	BUG_TRAP(!sk->sk_socket);
329	if (!sock_flag(sk, SOCK_DEAD)) {
330		printk("Attempt to release alive unix socket: %p\n", sk);
331		return;
332	}
333
334	if (u->addr)
335		unix_release_addr(u->addr);
336
337	atomic_dec(&unix_nr_socks);
338#ifdef UNIX_REFCNT_DEBUG
339	printk(KERN_DEBUG "UNIX %p is destroyed, %d are still alive.\n", sk, atomic_read(&unix_nr_socks));
340#endif
341}
342
/*
 * Tear down a unix sock.
 *
 * sk:      the sock to release.
 * embrion: non-zero when sk is a not-yet-accepted connection being
 *          discarded from a listener's queue (see the recursive call
 *          below); it forces ECONNRESET on the peer.
 *
 * Unhashes the sock, marks it closed and orphaned, shuts down and notifies
 * a stream/seqpacket peer, drops the peer reference, flushes the receive
 * queue, releases the bound dentry/mount and finally drops the sock
 * reference.  Always returns 0.
 */
static int unix_release_sock (struct sock *sk, int embrion)
{
	struct unix_sock *u = unix_sk(sk);
	struct dentry *dentry;
	struct vfsmount *mnt;
	struct sock *skpair;
	struct sk_buff *skb;
	int state;

	unix_remove_socket(sk);

	/* Clear state */
	unix_state_wlock(sk);
	sock_orphan(sk);
	sk->sk_shutdown = SHUTDOWN_MASK;
	/* Take dentry/mnt out of the sock now; they are put after unlocking. */
	dentry	     = u->dentry;
	u->dentry    = NULL;
	mnt	     = u->mnt;
	u->mnt	     = NULL;
	/* Remember the old state: a listener's queue needs special flushing. */
	state = sk->sk_state;
	sk->sk_state = TCP_CLOSE;
	unix_state_wunlock(sk);

	wake_up_interruptible_all(&u->peer_wait);

	skpair=unix_peer(sk);

	if (skpair!=NULL) {
		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
			unix_state_wlock(skpair);
			/* No more writes */
			skpair->sk_shutdown = SHUTDOWN_MASK;
			/* Unread data (or a dropped embryo) is reported as a reset. */
			if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
				skpair->sk_err = ECONNRESET;
			unix_state_wunlock(skpair);
			skpair->sk_state_change(skpair);
			read_lock(&skpair->sk_callback_lock);
			sk_wake_async(skpair,1,POLL_HUP);
			read_unlock(&skpair->sk_callback_lock);
		}
		sock_put(skpair); /* It may now die */
		unix_peer(sk) = NULL;
	}

	/* Try to flush out this socket. Throw out buffers at least */

	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
		/* On a listener each queued skb carries a pending connection's sock. */
		if (state==TCP_LISTEN)
			unix_release_sock(skb->sk, 1);
		/* passed fds are erased in the kfree_skb hook	      */
		kfree_skb(skb);
	}

	if (dentry) {
		dput(dentry);
		mntput(mnt);
	}

	sock_put(sk);

	/* ---- Socket is dead now and most probably destroyed ---- */

	/*
	 * Fixme: BSD difference: In BSD all sockets connected to use get
	 *	  ECONNRESET and we die on the spot. In Linux we behave
	 *	  like files and pipes do and wait for the last
	 *	  dereference.
	 *
	 * Can't we simply set sock->err?
	 *
	 *	  What the above comment does talk about? --ANK(980817)
	 */

	/* Only run the collector when some fds are in flight. */
	if (atomic_read(&unix_tot_inflight))
		unix_gc();		/* Garbage collect fds */

	return 0;
}
421
422static int unix_listen(struct socket *sock, int backlog)
423{
424	int err;
425	struct sock *sk = sock->sk;
426	struct unix_sock *u = unix_sk(sk);
427
428	err = -EOPNOTSUPP;
429	if (sock->type!=SOCK_STREAM && sock->type!=SOCK_SEQPACKET)
430		goto out;			/* Only stream/seqpacket sockets accept */
431	err = -EINVAL;
432	if (!u->addr)
433		goto out;			/* No listens on an unbound socket */
434	unix_state_wlock(sk);
435	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
436		goto out_unlock;
437	if (backlog > sk->sk_max_ack_backlog)
438		wake_up_interruptible_all(&u->peer_wait);
439	sk->sk_max_ack_backlog	= backlog;
440	sk->sk_state		= TCP_LISTEN;
441	/* set credentials so connect can copy them */
442	sk->sk_peercred.pid	= current->tgid;
443	sk->sk_peercred.uid	= current->euid;
444	sk->sk_peercred.gid	= current->egid;
445	err = 0;
446
447out_unlock:
448	unix_state_wunlock(sk);
449out:
450	return err;
451}
452
/*
 * Forward declarations of the socket operations referenced by the
 * proto_ops tables that follow.
 */
static int unix_release(struct socket *);
static int unix_bind(struct socket *, struct sockaddr *, int);
static int unix_stream_connect(struct socket *, struct sockaddr *,
			       int addr_len, int flags);
static int unix_socketpair(struct socket *, struct socket *);
static int unix_accept(struct socket *, struct socket *, int);
static int unix_getname(struct socket *, struct sockaddr *, int *, int);
static unsigned int unix_poll(struct file *, struct socket *, poll_table *);
static int unix_ioctl(struct socket *, unsigned int, unsigned long);
static int unix_shutdown(struct socket *, int);
static int unix_stream_sendmsg(struct kiocb *, struct socket *,
			       struct msghdr *, size_t);
static int unix_stream_recvmsg(struct kiocb *, struct socket *,
			       struct msghdr *, size_t, int);
static int unix_dgram_sendmsg(struct kiocb *, struct socket *,
			      struct msghdr *, size_t);
static int unix_dgram_recvmsg(struct kiocb *, struct socket *,
			      struct msghdr *, size_t, int);
static int unix_dgram_connect(struct socket *, struct sockaddr *,
			      int, int);
static int unix_seqpacket_sendmsg(struct kiocb *, struct socket *,
				  struct msghdr *, size_t);
475
/*
 * Operations for SOCK_STREAM unix sockets (selected in unix_create()).
 * Socket options, mmap and sendpage use the generic sock_no_* stubs.
 */
static const struct proto_ops unix_stream_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		unix_poll,
	.ioctl =	unix_ioctl,
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	unix_stream_sendmsg,
	.recvmsg =	unix_stream_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
};
496
/*
 * Operations for SOCK_DGRAM unix sockets (also used for SOCK_RAW, which
 * unix_create() rewrites to SOCK_DGRAM).  accept and listen use the
 * sock_no_* stubs; polling uses the generic datagram_poll.
 */
static const struct proto_ops unix_dgram_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_dgram_connect,
	.socketpair =	unix_socketpair,
	.accept =	sock_no_accept,
	.getname =	unix_getname,
	.poll =		datagram_poll,
	.ioctl =	unix_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	unix_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	unix_dgram_sendmsg,
	.recvmsg =	unix_dgram_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
};
517
/*
 * Operations for SOCK_SEQPACKET unix sockets (selected in unix_create()).
 * Connection handling (connect/accept/listen) is shared with the stream
 * table, while receive and poll are shared with the datagram table.
 */
static const struct proto_ops unix_seqpacket_ops = {
	.family =	PF_UNIX,
	.owner =	THIS_MODULE,
	.release =	unix_release,
	.bind =		unix_bind,
	.connect =	unix_stream_connect,
	.socketpair =	unix_socketpair,
	.accept =	unix_accept,
	.getname =	unix_getname,
	.poll =		datagram_poll,
	.ioctl =	unix_ioctl,
	.listen =	unix_listen,
	.shutdown =	unix_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	unix_seqpacket_sendmsg,
	.recvmsg =	unix_dgram_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
};
538
/*
 * Protocol descriptor passed to sk_alloc() in unix_create1(); obj_size
 * makes each allocated sock as large as struct unix_sock.
 */
static struct proto unix_proto = {
	.name	  = "UNIX",
	.owner	  = THIS_MODULE,
	.obj_size = sizeof(struct unix_sock),
};
544
545static struct sock * unix_create1(struct socket *sock)
546{
547	struct sock *sk = NULL;
548	struct unix_sock *u;
549
550	if (atomic_read(&unix_nr_socks) >= 2*files_stat.max_files)
551		goto out;
552
553	sk = sk_alloc(PF_UNIX, GFP_KERNEL, &unix_proto, 1);
554	if (!sk)
555		goto out;
556
557	atomic_inc(&unix_nr_socks);
558
559	sock_init_data(sock,sk);
560
561	sk->sk_write_space	= unix_write_space;
562	sk->sk_max_ack_backlog	= sysctl_unix_max_dgram_qlen;
563	sk->sk_destruct		= unix_sock_destructor;
564	u	  = unix_sk(sk);
565	u->dentry = NULL;
566	u->mnt	  = NULL;
567	spin_lock_init(&u->lock);
568	atomic_set(&u->inflight, sock ? 0 : -1);
569	init_MUTEX(&u->readsem); /* single task reading lock */
570	init_waitqueue_head(&u->peer_wait);
571	unix_insert_socket(unix_sockets_unbound, sk);
572out:
573	return sk;
574}
575
576static int unix_create(struct socket *sock, int protocol)
577{
578	if (protocol && protocol != PF_UNIX)
579		return -EPROTONOSUPPORT;
580
581	sock->state = SS_UNCONNECTED;
582
583	switch (sock->type) {
584	case SOCK_STREAM:
585		sock->ops = &unix_stream_ops;
586		break;
587		/*
588		 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
589		 *	nothing uses it.
590		 */
591	case SOCK_RAW:
592		sock->type=SOCK_DGRAM;
593	case SOCK_DGRAM:
594		sock->ops = &unix_dgram_ops;
595		break;
596	case SOCK_SEQPACKET:
597		sock->ops = &unix_seqpacket_ops;
598		break;
599	default:
600		return -ESOCKTNOSUPPORT;
601	}
602
603	return unix_create1(sock) ? 0 : -ENOMEM;
604}
605
606static int unix_release(struct socket *sock)
607{
608	struct sock *sk = sock->sk;
609
610	if (!sk)
611		return 0;
612
613	sock->sk = NULL;
614
615	return unix_release_sock (sk, 0);
616}
617
618static int unix_autobind(struct socket *sock)
619{
620	struct sock *sk = sock->sk;
621	struct unix_sock *u = unix_sk(sk);
622	static u32 ordernum = 1;
623	struct unix_address * addr;
624	int err;
625
626	down(&u->readsem);
627
628	err = 0;
629	if (u->addr)
630		goto out;
631
632	err = -ENOMEM;
633	addr = kmalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL);
634	if (!addr)
635		goto out;
636
637	memset(addr, 0, sizeof(*addr) + sizeof(short) + 16);
638	addr->name->sun_family = AF_UNIX;
639	atomic_set(&addr->refcnt, 1);
640
641retry:
642	addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short);
643	addr->hash = unix_hash_fold(csum_partial((void*)addr->name, addr->len, 0));
644
645	spin_lock(&unix_table_lock);
646	ordernum = (ordernum+1)&0xFFFFF;
647
648	if (__unix_find_socket_byname(addr->name, addr->len, sock->type,
649				      addr->hash)) {
650		spin_unlock(&unix_table_lock);
651		/* Sanity yield. It is unusual case, but yet... */
652		if (!(ordernum&0xFF))
653			yield();
654		goto retry;
655	}
656	addr->hash ^= sk->sk_type;
657
658	__unix_remove_socket(sk);
659	u->addr = addr;
660	__unix_insert_socket(&unix_socket_table[addr->hash], sk);
661	spin_unlock(&unix_table_lock);
662	err = 0;
663
664out:	up(&u->readsem);
665	return err;
666}
667
/*
 * Resolve a destination address to a socket.
 *
 * sunname/len: address as validated by unix_mkname().
 * type:        socket type the caller expects.
 * hash:        name hash; only used for abstract names.
 * error:       written only on failure.
 *
 * Filesystem names are looked up through the VFS: the caller needs write
 * permission on the node, the node must be a socket inode, and a socket
 * must be bound to that inode (-ECONNREFUSED otherwise).  A socket of a
 * different type yields -EPROTOTYPE.  Abstract names are looked up in the
 * hash table (-ECONNREFUSED if absent).
 *
 * Returns the socket with a reference held, or NULL with *error set.
 */
static struct sock *unix_find_other(struct sockaddr_un *sunname, int len,
				    int type, unsigned hash, int *error)
{
	struct sock *u;
	struct nameidata nd;
	int err = 0;

	if (sunname->sun_path[0]) {
		err = path_lookup(sunname->sun_path, LOOKUP_FOLLOW, &nd);
		if (err)
			goto fail;
		err = vfs_permission(&nd, MAY_WRITE);
		if (err)
			goto put_fail;

		err = -ECONNREFUSED;
		if (!S_ISSOCK(nd.dentry->d_inode->i_mode))
			goto put_fail;
		u=unix_find_socket_byinode(nd.dentry->d_inode);
		if (!u)
			goto put_fail;

		/* Update the access time only when the lookup will succeed. */
		if (u->sk_type == type)
			touch_atime(nd.mnt, nd.dentry);

		path_release(&nd);

		err=-EPROTOTYPE;
		if (u->sk_type != type) {
			/* Drop the reference taken by the inode lookup. */
			sock_put(u);
			goto fail;
		}
	} else {
		err = -ECONNREFUSED;
		u=unix_find_socket_byname(sunname, len, type, hash);
		if (u) {
			struct dentry *dentry;
			dentry = unix_sk(u)->dentry;
			if (dentry)
				touch_atime(unix_sk(u)->mnt, dentry);
		} else
			goto fail;
	}
	return u;

put_fail:
	path_release(&nd);
fail:
	*error=err;
	return NULL;
}
719
720
/*
 * bind() operation.
 *
 * An address consisting of only the family (addr_len == sizeof(short))
 * requests an automatic name via unix_autobind().  A filesystem name
 * creates a socket inode with vfs_mknod() and hashes the socket by inode
 * number; an abstract name is hashed by name and must be unique.
 *
 * Returns 0 on success; -EINVAL for a wrong family, a bad name, or a
 * socket that is already bound; -ENOMEM if the address cannot be
 * allocated; -EADDRINUSE if the name exists (including -EEXIST from the
 * filesystem); or another error from the path lookup / mknod.
 */
static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	struct sockaddr_un *sunaddr=(struct sockaddr_un *)uaddr;
	struct dentry * dentry = NULL;
	struct nameidata nd;
	int err;
	unsigned hash;
	struct unix_address *addr;
	struct hlist_head *list;

	err = -EINVAL;
	if (sunaddr->sun_family != AF_UNIX)
		goto out;

	if (addr_len==sizeof(short)) {
		err = unix_autobind(sock);
		goto out;
	}

	err = unix_mkname(sunaddr, addr_len, &hash);
	if (err < 0)
		goto out;
	/* unix_mkname() returns the length to use from here on. */
	addr_len = err;

	down(&u->readsem);

	/* A socket can be bound only once. */
	err = -EINVAL;
	if (u->addr)
		goto out_up;

	err = -ENOMEM;
	addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
	if (!addr)
		goto out_up;

	memcpy(addr->name, sunaddr, addr_len);
	addr->len = addr_len;
	addr->hash = hash ^ sk->sk_type;
	atomic_set(&addr->refcnt, 1);

	if (sunaddr->sun_path[0]) {
		unsigned int mode;
		err = 0;
		/*
		 * Get the parent directory, calculate the hash for last
		 * component.
		 */
		err = path_lookup(sunaddr->sun_path, LOOKUP_PARENT, &nd);
		if (err)
			goto out_mknod_parent;

		dentry = lookup_create(&nd, 0);
		err = PTR_ERR(dentry);
		if (IS_ERR(dentry))
			goto out_mknod_unlock;

		/*
		 * All right, let's create it.
		 */
		mode = S_IFSOCK |
		       (SOCK_INODE(sock)->i_mode & ~current->fs->umask);
		err = vfs_mknod(nd.dentry->d_inode, dentry, mode, 0);
		if (err)
			goto out_mknod_dput;
		/*
		 * Done with the parent directory: unlock it, drop it, and
		 * keep the new node's dentry in nd for storing in the sock.
		 */
		mutex_unlock(&nd.dentry->d_inode->i_mutex);
		dput(nd.dentry);
		nd.dentry = dentry;

		/* Marker hash value for filesystem names (see UNIX_ABSTRACT). */
		addr->hash = UNIX_HASH_SIZE;
	}

	spin_lock(&unix_table_lock);

	if (!sunaddr->sun_path[0]) {
		/* Abstract names must be unique per socket type. */
		err = -EADDRINUSE;
		if (__unix_find_socket_byname(sunaddr, addr_len,
					      sk->sk_type, hash)) {
			unix_release_addr(addr);
			goto out_unlock;
		}

		list = &unix_socket_table[addr->hash];
	} else {
		/* Filesystem sockets are hashed by inode number. */
		list = &unix_socket_table[dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1)];
		u->dentry = nd.dentry;
		u->mnt    = nd.mnt;
	}

	err = 0;
	/* Move the socket from its current chain to the chosen one. */
	__unix_remove_socket(sk);
	u->addr = addr;
	__unix_insert_socket(list, sk);

out_unlock:
	spin_unlock(&unix_table_lock);
out_up:
	up(&u->readsem);
out:
	return err;

	/* Error unwinding for the filesystem-name path, innermost first. */
out_mknod_dput:
	dput(dentry);
out_mknod_unlock:
	mutex_unlock(&nd.dentry->d_inode->i_mutex);
	path_release(&nd);
out_mknod_parent:
	if (err==-EEXIST)
		err=-EADDRINUSE;
	unix_release_addr(addr);
	goto out_up;
}
834
/*
 * connect() operation for datagram sockets.
 *
 * With family AF_UNSPEC the socket is disconnected (peer set to NULL).
 * Otherwise the destination is resolved with unix_find_other() and
 * installed as the peer, provided unix_may_send() and the security hook
 * allow it.  A socket with SOCK_PASSCRED set and no address is autobound
 * first.  When the peer changes, packets queued from the old peer are
 * discarded via unix_dgram_disconnected().
 *
 * Returns 0 on success, -EPERM if the destination is connected to someone
 * else, or the error from name validation, autobind, lookup or the
 * security hook.
 */
static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
			      int alen, int flags)
{
	struct sock *sk = sock->sk;
	struct sockaddr_un *sunaddr=(struct sockaddr_un*)addr;
	struct sock *other;
	unsigned hash;
	int err;

	if (addr->sa_family != AF_UNSPEC) {
		err = unix_mkname(sunaddr, alen, &hash);
		if (err < 0)
			goto out;
		alen = err;

		if (test_bit(SOCK_PASSCRED, &sock->flags) &&
		    !unix_sk(sk)->addr && (err = unix_autobind(sock)) != 0)
			goto out;

		/* On success 'other' is returned with a reference held. */
		other=unix_find_other(sunaddr, alen, sock->type, hash, &err);
		if (!other)
			goto out;

		unix_state_wlock(sk);

		err = -EPERM;
		if (!unix_may_send(sk, other))
			goto out_unlock;

		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
		if (err)
			goto out_unlock;

	} else {
		/*
		 *	1003.1g breaking connected state with AF_UNSPEC
		 */
		other = NULL;
		unix_state_wlock(sk);
	}

	/*
	 * If it was connected, reconnect.
	 */
	if (unix_peer(sk)) {
		struct sock *old_peer = unix_peer(sk);
		unix_peer(sk)=other;
		unix_state_wunlock(sk);

		if (other != old_peer)
			unix_dgram_disconnected(sk, old_peer);
		/* Drop the reference that was held on the previous peer. */
		sock_put(old_peer);
	} else {
		/* The lookup's reference on 'other' now belongs to the peer link. */
		unix_peer(sk)=other;
		unix_state_wunlock(sk);
	}
 	return 0;

out_unlock:
	unix_state_wunlock(sk);
	sock_put(other);
out:
	return err;
}
899
900static long unix_wait_for_peer(struct sock *other, long timeo)
901{
902	struct unix_sock *u = unix_sk(other);
903	int sched;
904	DEFINE_WAIT(wait);
905
906	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
907
908	sched = !sock_flag(other, SOCK_DEAD) &&
909		!(other->sk_shutdown & RCV_SHUTDOWN) &&
910		(skb_queue_len(&other->sk_receive_queue) >
911		 other->sk_max_ack_backlog);
912
913	unix_state_runlock(other);
914
915	if (sched)
916		timeo = schedule_timeout(timeo);
917
918	finish_wait(&u->peer_wait, &wait);
919	return timeo;
920}
921
/*
 * connect() operation for stream and seqpacket sockets.
 *
 * A new sock (newsk) is created up front to become the server-side end of
 * the connection, together with a one-byte skb that is queued on the
 * listener's receive queue; unix_accept() later takes the sock from that
 * skb.  The listener is looked up and read-locked, then our own state is
 * write-locked and rechecked before the two ends are linked.
 *
 * Returns 0 on success; -ENOMEM if newsk or the skb cannot be allocated;
 * -ECONNREFUSED if the target is not listening; -EAGAIN if the backlog is
 * full and the socket is non-blocking; -EISCONN / -EINVAL if this socket
 * is already connected or in another state; or the error from name
 * validation, autobind, lookup, an interrupted wait or the security hook.
 */
static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
			       int addr_len, int flags)
{
	struct sockaddr_un *sunaddr=(struct sockaddr_un *)uaddr;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
	struct sock *newsk = NULL;
	struct sock *other = NULL;
	struct sk_buff *skb = NULL;
	unsigned hash;
	int st;
	int err;
	long timeo;

	err = unix_mkname(sunaddr, addr_len, &hash);
	if (err < 0)
		goto out;
	addr_len = err;

	if (test_bit(SOCK_PASSCRED, &sock->flags)
		&& !u->addr && (err = unix_autobind(sock)) != 0)
		goto out;

	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

	/* First of all allocate resources.
	   If we will make it after state is locked,
	   we will have to recheck all again in any case.
	 */

	err = -ENOMEM;

	/* create new sock for complete connection */
	newsk = unix_create1(NULL);
	if (newsk == NULL)
		goto out;

	/* Allocate skb for sending to listening sock */
	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
	if (skb == NULL)
		goto out;

restart:
	/*  Find listening sock. */
	other = unix_find_other(sunaddr, addr_len, sk->sk_type, hash, &err);
	if (!other)
		goto out;

	/* Latch state of peer */
	unix_state_rlock(other);

	/* Apparently VFS overslept socket death. Retry. */
	if (sock_flag(other, SOCK_DEAD)) {
		unix_state_runlock(other);
		sock_put(other);
		goto restart;
	}

	err = -ECONNREFUSED;
	if (other->sk_state != TCP_LISTEN)
		goto out_unlock;

	if (skb_queue_len(&other->sk_receive_queue) >
	    other->sk_max_ack_backlog) {
		err = -EAGAIN;
		if (!timeo)
			goto out_unlock;

		/* Releases the read lock on 'other' before sleeping. */
		timeo = unix_wait_for_peer(other, timeo);

		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			goto out;
		sock_put(other);
		goto restart;
        }

	/* Latch our state.

	   It is tricky place. We need to grab write lock and cannot
	   drop lock on peer. It is dangerous because deadlock is
	   possible. Connect to self case and simultaneous
	   attempt to connect are eliminated by checking socket
	   state. other is TCP_LISTEN, if sk is TCP_LISTEN we
	   check this before attempt to grab lock.

	   Well, and we have to recheck the state after socket locked.
	 */
	st = sk->sk_state;

	switch (st) {
	case TCP_CLOSE:
		/* This is ok... continue with connect */
		break;
	case TCP_ESTABLISHED:
		/* Socket is already connected */
		err = -EISCONN;
		goto out_unlock;
	default:
		err = -EINVAL;
		goto out_unlock;
	}

	unix_state_wlock(sk);

	/* State changed while we were not holding the lock: start over. */
	if (sk->sk_state != st) {
		unix_state_wunlock(sk);
		unix_state_runlock(other);
		sock_put(other);
		goto restart;
	}

	err = security_unix_stream_connect(sock, other->sk_socket, newsk);
	if (err) {
		unix_state_wunlock(sk);
		goto out_unlock;
	}

	/* The way is open! Fastly set all the necessary fields... */

	/* newsk's peer link holds a reference on sk. */
	sock_hold(sk);
	unix_peer(newsk)	= sk;
	newsk->sk_state		= TCP_ESTABLISHED;
	newsk->sk_type		= sk->sk_type;
	newsk->sk_peercred.pid	= current->tgid;
	newsk->sk_peercred.uid	= current->euid;
	newsk->sk_peercred.gid	= current->egid;
	newu = unix_sk(newsk);
	newsk->sk_sleep		= &newu->peer_wait;
	otheru = unix_sk(other);

	/* copy address information from listening to new sock*/
	if (otheru->addr) {
		atomic_inc(&otheru->addr->refcnt);
		newu->addr = otheru->addr;
	}
	if (otheru->dentry) {
		newu->dentry	= dget(otheru->dentry);
		newu->mnt	= mntget(otheru->mnt);
	}

	/* Set credentials */
	sk->sk_peercred = other->sk_peercred;

	sock->state	= SS_CONNECTED;
	sk->sk_state	= TCP_ESTABLISHED;
	/* sk's peer link holds a reference on newsk. */
	sock_hold(newsk);

	smp_mb__after_atomic_inc();	/* sock_hold() does an atomic_inc() */
	unix_peer(sk)	= newsk;

	unix_state_wunlock(sk);

	/* take ten and send info to listening sock */
	spin_lock(&other->sk_receive_queue.lock);
	__skb_queue_tail(&other->sk_receive_queue, skb);
	/* Undo artificially decreased inflight after embrion
	 * is installed to listening socket. */
	atomic_inc(&newu->inflight);
	spin_unlock(&other->sk_receive_queue.lock);
	unix_state_runlock(other);
	other->sk_data_ready(other, 0);
	sock_put(other);
	return 0;

out_unlock:
	if (other)
		unix_state_runlock(other);

out:
	/* Release whatever was set up before the failure. */
	if (skb)
		kfree_skb(skb);
	if (newsk)
		unix_release_sock(newsk, 0);
	if (other)
		sock_put(other);
	return err;
}
1100
1101static int unix_socketpair(struct socket *socka, struct socket *sockb)
1102{
1103	struct sock *ska=socka->sk, *skb = sockb->sk;
1104
1105	/* Join our sockets back to back */
1106	sock_hold(ska);
1107	sock_hold(skb);
1108	unix_peer(ska)=skb;
1109	unix_peer(skb)=ska;
1110	ska->sk_peercred.pid = skb->sk_peercred.pid = current->tgid;
1111	ska->sk_peercred.uid = skb->sk_peercred.uid = current->euid;
1112	ska->sk_peercred.gid = skb->sk_peercred.gid = current->egid;
1113
1114	if (ska->sk_type != SOCK_DGRAM) {
1115		ska->sk_state = TCP_ESTABLISHED;
1116		skb->sk_state = TCP_ESTABLISHED;
1117		socka->state  = SS_CONNECTED;
1118		sockb->state  = SS_CONNECTED;
1119	}
1120	return 0;
1121}
1122
1123static int unix_accept(struct socket *sock, struct socket *newsock, int flags)
1124{
1125	struct sock *sk = sock->sk;
1126	struct sock *tsk;
1127	struct sk_buff *skb;
1128	int err;
1129
1130	err = -EOPNOTSUPP;
1131	if (sock->type!=SOCK_STREAM && sock->type!=SOCK_SEQPACKET)
1132		goto out;
1133
1134	err = -EINVAL;
1135	if (sk->sk_state != TCP_LISTEN)
1136		goto out;
1137
1138	/* If socket state is TCP_LISTEN it cannot change (for now...),
1139	 * so that no locks are necessary.
1140	 */
1141
1142	skb = skb_recv_datagram(sk, 0, flags&O_NONBLOCK, &err);
1143	if (!skb) {
1144		/* This means receive shutdown. */
1145		if (err == 0)
1146			err = -EINVAL;
1147		goto out;
1148	}
1149
1150	tsk = skb->sk;
1151	skb_free_datagram(sk, skb);
1152	wake_up_interruptible(&unix_sk(sk)->peer_wait);
1153
1154	/* attach accepted sock to socket */
1155	unix_state_wlock(tsk);
1156	newsock->state = SS_CONNECTED;
1157	sock_graft(tsk, newsock);
1158	unix_state_wunlock(tsk);
1159	return 0;
1160
1161out:
1162	return err;
1163}
1164
1165
1166static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_len, int peer)
1167{
1168	struct sock *sk = sock->sk;
1169	struct unix_sock *u;
1170	struct sockaddr_un *sunaddr=(struct sockaddr_un *)uaddr;
1171	int err = 0;
1172
1173	if (peer) {
1174		sk = unix_peer_get(sk);
1175
1176		err = -ENOTCONN;
1177		if (!sk)
1178			goto out;
1179		err = 0;
1180	} else {
1181		sock_hold(sk);
1182	}
1183
1184	u = unix_sk(sk);
1185	unix_state_rlock(sk);
1186	if (!u->addr) {
1187		sunaddr->sun_family = AF_UNIX;
1188		sunaddr->sun_path[0] = 0;
1189		*uaddr_len = sizeof(short);
1190	} else {
1191		struct unix_address *addr = u->addr;
1192
1193		*uaddr_len = addr->len;
1194		memcpy(sunaddr, addr->name, *uaddr_len);
1195	}
1196	unix_state_runlock(sk);
1197	sock_put(sk);
1198out:
1199	return err;
1200}
1201
1202static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1203{
1204	int i;
1205
1206	scm->fp = UNIXCB(skb).fp;
1207	skb->destructor = sock_wfree;
1208	UNIXCB(skb).fp = NULL;
1209
1210	for (i=scm->fp->count-1; i>=0; i--)
1211		unix_notinflight(scm->fp->fp[i]);
1212}
1213
1214static void unix_destruct_fds(struct sk_buff *skb)
1215{
1216	struct scm_cookie scm;
1217	memset(&scm, 0, sizeof(scm));
1218	unix_detach_fds(&scm, skb);
1219
1220	/* Alas, it calls VFS */
1221	/* So fscking what? fput() had been SMP-safe since the last Summer */
1222	scm_destroy(&scm);
1223	sock_wfree(skb);
1224}
1225
1226static void unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1227{
1228	int i;
1229	for (i=scm->fp->count-1; i>=0; i--)
1230		unix_inflight(scm->fp->fp[i]);
1231	UNIXCB(skb).fp = scm->fp;
1232	skb->destructor = unix_destruct_fds;
1233	scm->fp = NULL;
1234}
1235
/*
 *	Send one AF_UNIX datagram.
 *
 *	The destination is msg->msg_name when msg->msg_namelen is non-zero,
 *	otherwise the socket's current peer.  The payload is copied into a
 *	single skb which is appended to the receiver's receive queue.
 *
 *	Returns len on success or a negative errno.  siocb->scm is passed
 *	to scm_destroy() on every path taken after scm_send() succeeded.
 *	Also called by unix_seqpacket_sendmsg().
 */

static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock,
			      struct msghdr *msg, size_t len)
{
	struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	struct sockaddr_un *sunaddr=msg->msg_name;
	struct sock *other = NULL;
	int namelen = 0; /* only set on the msg_namelen path; initialised to keep GCC quiet */
	int err;
	unsigned hash;
	struct sk_buff *skb;
	long timeo;
	struct scm_cookie tmp_scm;

	/* Fall back to an on-stack cookie if the iocb does not carry one. */
	if (NULL == siocb->scm)
		siocb->scm = &tmp_scm;
	err = scm_send(sock, msg, siocb->scm);
	if (err < 0)
		return err;

	/* From here on every exit must go through "out" (scm_destroy). */
	err = -EOPNOTSUPP;
	if (msg->msg_flags&MSG_OOB)
		goto out;

	if (msg->msg_namelen) {
		/* Explicit destination: validate it; the lookup happens at "restart". */
		err = unix_mkname(sunaddr, msg->msg_namelen, &hash);
		if (err < 0)
			goto out;
		namelen = err;
	} else {
		/* No destination given: use the connected peer, if any.
		 * The reference obtained here is dropped with sock_put()
		 * on every exit path.
		 */
		sunaddr = NULL;
		err = -ENOTCONN;
		other = unix_peer_get(sk);
		if (!other)
			goto out;
	}

	/* A sender with SOCK_PASSCRED set but no address is autobound first. */
	if (test_bit(SOCK_PASSCRED, &sock->flags)
		&& !u->addr && (err = unix_autobind(sock)) != 0)
		goto out;

	err = -EMSGSIZE;
	if (len > sk->sk_sndbuf - 32)
		goto out;

	skb = sock_alloc_send_skb(sk, len, msg->msg_flags&MSG_DONTWAIT, &err);
	if (skb==NULL)
		goto out;

	/* Stamp the sender's credentials and any passed files on the skb. */
	memcpy(UNIXCREDS(skb), &siocb->scm->creds, sizeof(struct ucred));
	if (siocb->scm->fp)
		unix_attach_fds(siocb->scm, skb);

	skb->h.raw = skb->data;
	err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
	if (err)
		goto out_free;

	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);

restart:
	/* other is NULL either on the first pass with an explicit address,
	 * or after a dead receiver was dropped further down.
	 */
	if (!other) {
		err = -ECONNRESET;
		if (sunaddr == NULL)
			goto out_free;

		other = unix_find_other(sunaddr, namelen, sk->sk_type,
					hash, &err);
		if (other==NULL)
			goto out_free;
	}

	unix_state_rlock(other);
	err = -EPERM;
	if (!unix_may_send(sk, other))
		goto out_unlock;

	if (sock_flag(other, SOCK_DEAD)) {
		/*
		 *	The receiver is dead.
		 *	(Open question from the original author: check
		 *	with 1003.1g what the datagram error should be.)
		 */
		unix_state_runlock(other);
		sock_put(other);

		err = 0;
		unix_state_wlock(sk);
		if (unix_peer(sk) == other) {
			/* It was our connected peer: disconnect from it,
			 * drop the reference the peer pointer held and
			 * fail with -ECONNREFUSED.
			 */
			unix_peer(sk)=NULL;
			unix_state_wunlock(sk);

			unix_dgram_disconnected(sk, other);
			sock_put(other);
			err = -ECONNREFUSED;
		} else {
			unix_state_wunlock(sk);
		}

		/* Otherwise retry; with an explicit address the lookup is
		 * repeated, without one "restart" fails with -ECONNRESET.
		 */
		other = NULL;
		if (err)
			goto out_free;
		goto restart;
	}

	err = -EPIPE;
	if (other->sk_shutdown & RCV_SHUTDOWN)
		goto out_unlock;

	if (sk->sk_type != SOCK_SEQPACKET) {
		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
		if (err)
			goto out_unlock;
	}

	/* Receiver's queue is over its backlog limit and the receiver is
	 * not connected back to us: fail with -EAGAIN if no timeout is
	 * left, otherwise wait and retry.
	 */
	if (unix_peer(other) != sk &&
	    (skb_queue_len(&other->sk_receive_queue) >
	     other->sk_max_ack_backlog)) {
		if (!timeo) {
			err = -EAGAIN;
			goto out_unlock;
		}

		/* NOTE(review): both continuations below skip out_unlock,
		 * so unix_wait_for_peer() presumably returns with other's
		 * state lock released - confirm against its definition.
		 */
		timeo = unix_wait_for_peer(other, timeo);

		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			goto out_free;

		goto restart;
	}

	/* Deliver: queue the skb and notify the receiver. */
	skb_queue_tail(&other->sk_receive_queue, skb);
	unix_state_runlock(other);
	other->sk_data_ready(other, len);
	sock_put(other);
	scm_destroy(siocb->scm);
	return len;

out_unlock:
	unix_state_runlock(other);
out_free:
	kfree_skb(skb);
out:
	if (other)
		sock_put(other);
	scm_destroy(siocb->scm);
	return err;
}
1389
1390
/*
 *	Send data on a connected SOCK_STREAM socket.
 *
 *	The payload is split into skbs of at most sk_sndbuf/2 - 64 bytes
 *	(and at most SKB_MAX_ALLOC); each one is appended to the peer's
 *	receive queue.
 *
 *	Returns the number of bytes queued if that is non-zero, otherwise a
 *	negative errno.  A destination address is rejected (-EISCONN when
 *	established, -EOPNOTSUPP otherwise).  When nothing was sent because
 *	the connection is shut down, SIGPIPE is raised unless MSG_NOSIGNAL
 *	is set.
 */
static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
			       struct msghdr *msg, size_t len)
{
	struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
	struct sock *sk = sock->sk;
	struct sock *other = NULL;
	struct sockaddr_un *sunaddr=msg->msg_name;
	int err,size;
	struct sk_buff *skb;
	int sent=0;
	struct scm_cookie tmp_scm;

	/* Fall back to an on-stack cookie if the iocb does not carry one. */
	if (NULL == siocb->scm)
		siocb->scm = &tmp_scm;
	err = scm_send(sock, msg, siocb->scm);
	if (err < 0)
		return err;

	err = -EOPNOTSUPP;
	if (msg->msg_flags&MSG_OOB)
		goto out_err;

	if (msg->msg_namelen) {
		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
		goto out_err;
	} else {
		sunaddr = NULL;
		err = -ENOTCONN;
		/* Plain read of the peer pointer: no reference is taken
		 * here and none is dropped on exit.
		 */
		other = unix_peer(sk);
		if (!other)
			goto out_err;
	}

	if (sk->sk_shutdown & SEND_SHUTDOWN)
		goto pipe_err;

	while(sent < len)
	{
		/*
		 *	Optimisation for the fact that under 0.01% of X messages typically
		 *	need breaking up.
		 */

		size=len-sent;

		/* Keep two messages in the pipe so it schedules better */
		if (size > sk->sk_sndbuf / 2 - 64)
			size = sk->sk_sndbuf / 2 - 64;

		if (size > SKB_MAX_ALLOC)
			size = SKB_MAX_ALLOC;

		/*
		 *	Grab a buffer
		 */

		skb=sock_alloc_send_skb(sk,size,msg->msg_flags&MSG_DONTWAIT, &err);

		if (skb==NULL)
			goto out_err;

		/*
		 *	If you pass two values to the sock_alloc_send_skb
		 *	it tries to grab the large buffer with GFP_NOFS
		 *	(which can fail easily), and if it fails grab the
		 *	fallback size buffer which is under a page and will
		 *	succeed. [Alan]
		 */
		/* Never copy more than the skb actually has room for. */
		size = min_t(int, size, skb_tailroom(skb));

		/* Credentials go on every skb.  unix_attach_fds() clears
		 * scm->fp, so passed files travel with the first skb only.
		 */
		memcpy(UNIXCREDS(skb), &siocb->scm->creds, sizeof(struct ucred));
		if (siocb->scm->fp)
			unix_attach_fds(siocb->scm, skb);

		if ((err = memcpy_fromiovec(skb_put(skb,size), msg->msg_iov, size)) != 0) {
			kfree_skb(skb);
			goto out_err;
		}

		unix_state_rlock(other);

		/* Peer gone or no longer receiving: treat as a broken pipe. */
		if (sock_flag(other, SOCK_DEAD) ||
		    (other->sk_shutdown & RCV_SHUTDOWN))
			goto pipe_err_free;

		skb_queue_tail(&other->sk_receive_queue, skb);
		unix_state_runlock(other);
		other->sk_data_ready(other, size);
		sent+=size;
	}

	scm_destroy(siocb->scm);
	siocb->scm = NULL;

	return sent;

pipe_err_free:
	unix_state_runlock(other);
	kfree_skb(skb);
pipe_err:
	/* Signal only when nothing at all was delivered. */
	if (sent==0 && !(msg->msg_flags&MSG_NOSIGNAL))
		send_sig(SIGPIPE,current,0);
	err = -EPIPE;
out_err:
	scm_destroy(siocb->scm);
	siocb->scm = NULL;
	/* A partial write reports the byte count instead of the error. */
	return sent ? : err;
}
1499
1500static int unix_seqpacket_sendmsg(struct kiocb *kiocb, struct socket *sock,
1501				  struct msghdr *msg, size_t len)
1502{
1503	int err;
1504	struct sock *sk = sock->sk;
1505
1506	err = sock_error(sk);
1507	if (err)
1508		return err;
1509
1510	if (sk->sk_state != TCP_ESTABLISHED)
1511		return -ENOTCONN;
1512
1513	if (msg->msg_namelen)
1514		msg->msg_namelen = 0;
1515
1516	return unix_dgram_sendmsg(kiocb, sock, msg, len);
1517}
1518
1519static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
1520{
1521	struct unix_sock *u = unix_sk(sk);
1522
1523	msg->msg_namelen = 0;
1524	if (u->addr) {
1525		msg->msg_namelen = u->addr->len;
1526		memcpy(msg->msg_name, u->addr->name, u->addr->len);
1527	}
1528}
1529
/*
 *	Receive one datagram.
 *
 *	Takes a single skb from the receive queue, copies at most @size
 *	bytes of it to the caller and sets MSG_TRUNC when the datagram was
 *	longer than @size.  The sender's address, credentials and passed
 *	files are handed back through @msg / siocb->scm.
 *
 *	Returns the number of bytes copied or a negative errno.  The whole
 *	operation runs under u->readsem.
 */
static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
			      struct msghdr *msg, size_t size,
			      int flags)
{
	struct sock_iocb *siocb = kiocb_to_siocb(iocb);
	struct scm_cookie tmp_scm;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	int noblock = flags & MSG_DONTWAIT;
	struct sk_buff *skb;
	int err;

	err = -EOPNOTSUPP;
	if (flags&MSG_OOB)
		goto out;

	msg->msg_namelen = 0;

	down(&u->readsem);

	skb = skb_recv_datagram(sk, flags, noblock, &err);
	if (!skb)
		goto out_unlock;

	/* Wake anyone sleeping on our peer_wait queue. */
	wake_up_interruptible(&u->peer_wait);

	/* skb->sk is the sending sock; report its bound address, if any. */
	if (msg->msg_name)
		unix_copy_addr(msg, skb->sk);

	/* Clamp to the datagram length; flag truncation if the buffer is short. */
	if (size > skb->len)
		size = skb->len;
	else if (size < skb->len)
		msg->msg_flags |= MSG_TRUNC;

	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, size);
	if (err)
		goto out_free;

	/* Fall back to a zeroed on-stack cookie if the iocb has none. */
	if (!siocb->scm) {
		siocb->scm = &tmp_scm;
		memset(&tmp_scm, 0, sizeof(tmp_scm));
	}
	siocb->scm->creds = *UNIXCREDS(skb);

	if (!(flags & MSG_PEEK))
	{
		/* Normal read: the files move from the skb to the cookie. */
		if (UNIXCB(skb).fp)
			unix_detach_fds(siocb->scm, skb);
	}
	else
	{
		/* It is questionable what to do on PEEK. We could:
		   - not return fds - good, but too simple 8)
		   - return fds, and not return them on read (old strategy,
		     apparently wrong)
		   - clone fds (chosen for now, it is the most universal
		     solution)

		   POSIX 1003.1g does not actually define this clearly
		   at all. POSIX 1003.1g doesn't define a lot of things
		   clearly however!

		*/
		if (UNIXCB(skb).fp)
			siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);
	}
	err = size;

	scm_recv(sock, msg, siocb->scm, flags);

out_free:
	skb_free_datagram(sk,skb);
out_unlock:
	up(&u->readsem);
out:
	return err;
}
1607
/*
 *	Sleep until data has arrived. But check for races: the wake-up
 *	conditions are tested only after prepare_to_wait() has queued us
 *	on sk->sk_sleep, and the state lock is dropped just around the
 *	actual sleep.
 *
 *	The wait ends when the receive queue is non-empty, an error is
 *	pending, receive has been shut down, a signal is pending or the
 *	timeout has run out.  Returns the timeout that is left.
 */

static long unix_stream_data_wait(struct sock * sk, long timeo)
{
	DEFINE_WAIT(wait);

	unix_state_rlock(sk);

	for (;;) {
		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);

		if (!skb_queue_empty(&sk->sk_receive_queue) ||
		    sk->sk_err ||
		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
		    signal_pending(current) ||
		    !timeo)
			break;

		/* SOCK_ASYNC_WAITDATA is set only for the duration of the sleep. */
		set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
		unix_state_runlock(sk);
		timeo = schedule_timeout(timeo);
		unix_state_rlock(sk);
		clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
	}

	finish_wait(sk->sk_sleep, &wait);
	unix_state_runlock(sk);
	return timeo;
}
1639
1640
1641
/*
 *	Receive data from a connected SOCK_STREAM socket.
 *
 *	skbs are taken off the receive queue one at a time and copied to
 *	the caller until @size bytes were delivered or one of the stop
 *	conditions in the loop is hit (different sender credentials, passed
 *	files, MSG_PEEK, partial skb, error, shutdown, timeout).  Data from
 *	skbs carrying different credentials is never merged into one read.
 *
 *	Returns the number of bytes copied if that is non-zero (this may be
 *	-EFAULT when the very first copy failed), otherwise err: 0 on
 *	receive shutdown or a negative errno.
 */
static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
			       struct msghdr *msg, size_t size,
			       int flags)
{
	struct sock_iocb *siocb = kiocb_to_siocb(iocb);
	struct scm_cookie tmp_scm;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	struct sockaddr_un *sunaddr=msg->msg_name;
	int copied = 0;
	int check_creds = 0;	/* set once the first skb's creds were recorded */
	int target;
	int err = 0;
	long timeo;

	err = -EINVAL;
	if (sk->sk_state != TCP_ESTABLISHED)
		goto out;

	err = -EOPNOTSUPP;
	if (flags&MSG_OOB)
		goto out;

	/* target: number of bytes that must be copied before an empty
	 * queue ends the read instead of making us wait.
	 */
	target = sock_rcvlowat(sk, flags&MSG_WAITALL, size);
	timeo = sock_rcvtimeo(sk, flags&MSG_DONTWAIT);

	msg->msg_namelen = 0;

	/* Fall back to a zeroed on-stack cookie if the iocb has none. */
	if (!siocb->scm) {
		siocb->scm = &tmp_scm;
		memset(&tmp_scm, 0, sizeof(tmp_scm));
	}

	/* Hold readsem to prevent the queue from being disordered
	 * while we sleep copying data to the caller.
	 */
	down(&u->readsem);

	do
	{
		int chunk;
		struct sk_buff *skb;

		skb = skb_dequeue(&sk->sk_receive_queue);
		if (skb==NULL)
		{
			if (copied >= target)
				break;

			/*
			 *	POSIX 1003.1g mandates this order.
			 */

			if ((err = sock_error(sk)) != 0)
				break;
			if (sk->sk_shutdown & RCV_SHUTDOWN)
				break;
			err = -EAGAIN;
			if (!timeo)
				break;
			/* readsem is released for the duration of the wait. */
			up(&u->readsem);

			timeo = unix_stream_data_wait(sk, timeo);

			/* readsem is not held here, so leave via "out",
			 * which skips both up() and scm_recv().
			 */
			if (signal_pending(current)) {
				err = sock_intr_errno(timeo);
				goto out;
			}
			down(&u->readsem);
			continue;
		}

		if (check_creds) {
			/* Never glue messages from different writers */
			if (memcmp(UNIXCREDS(skb), &siocb->scm->creds, sizeof(siocb->scm->creds)) != 0) {
				skb_queue_head(&sk->sk_receive_queue, skb);
				break;
			}
		} else {
			/* Copy credentials */
			siocb->scm->creds = *UNIXCREDS(skb);
			check_creds = 1;
		}

		/* Copy address just once */
		if (sunaddr)
		{
			unix_copy_addr(msg, skb->sk);
			sunaddr = NULL;
		}

		chunk = min_t(unsigned int, skb->len, size);
		if (memcpy_toiovec(msg->msg_iov, skb->data, chunk)) {
			/* Copy failed: requeue the skb untouched; report
			 * -EFAULT only if nothing was delivered before.
			 */
			skb_queue_head(&sk->sk_receive_queue, skb);
			if (copied == 0)
				copied = -EFAULT;
			break;
		}
		copied += chunk;
		size -= chunk;

		/* Mark read part of skb as used */
		if (!(flags & MSG_PEEK))
		{
			skb_pull(skb, chunk);

			if (UNIXCB(skb).fp)
				unix_detach_fds(siocb->scm, skb);

			/* put the skb back if we didn't use it up.. */
			if (skb->len)
			{
				skb_queue_head(&sk->sk_receive_queue, skb);
				break;
			}

			kfree_skb(skb);

			/* Stop as soon as the cookie holds passed files. */
			if (siocb->scm->fp)
				break;
		}
		else
		{
			/* It is questionable, see note in unix_dgram_recvmsg.
			 */
			if (UNIXCB(skb).fp)
				siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);

			/* put message back and return */
			skb_queue_head(&sk->sk_receive_queue, skb);
			break;
		}
	} while (size);

	up(&u->readsem);
	scm_recv(sock, msg, siocb->scm, flags);
out:
	return copied ? : err;
}
1782
1783static int unix_shutdown(struct socket *sock, int mode)
1784{
1785	struct sock *sk = sock->sk;
1786	struct sock *other;
1787
1788	mode = (mode+1)&(RCV_SHUTDOWN|SEND_SHUTDOWN);
1789
1790	if (mode) {
1791		unix_state_wlock(sk);
1792		sk->sk_shutdown |= mode;
1793		other=unix_peer(sk);
1794		if (other)
1795			sock_hold(other);
1796		unix_state_wunlock(sk);
1797		sk->sk_state_change(sk);
1798
1799		if (other &&
1800			(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
1801
1802			int peer_mode = 0;
1803
1804			if (mode&RCV_SHUTDOWN)
1805				peer_mode |= SEND_SHUTDOWN;
1806			if (mode&SEND_SHUTDOWN)
1807				peer_mode |= RCV_SHUTDOWN;
1808			unix_state_wlock(other);
1809			other->sk_shutdown |= peer_mode;
1810			unix_state_wunlock(other);
1811			other->sk_state_change(other);
1812			read_lock(&other->sk_callback_lock);
1813			if (peer_mode == SHUTDOWN_MASK)
1814				sk_wake_async(other,1,POLL_HUP);
1815			else if (peer_mode & RCV_SHUTDOWN)
1816				sk_wake_async(other,1,POLL_IN);
1817			read_unlock(&other->sk_callback_lock);
1818		}
1819		if (other)
1820			sock_put(other);
1821	}
1822	return 0;
1823}
1824
1825static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1826{
1827	struct sock *sk = sock->sk;
1828	long amount=0;
1829	int err;
1830
1831	switch(cmd)
1832	{
1833		case SIOCOUTQ:
1834			amount = atomic_read(&sk->sk_wmem_alloc);
1835			err = put_user(amount, (int __user *)arg);
1836			break;
1837		case SIOCINQ:
1838		{
1839			struct sk_buff *skb;
1840
1841			if (sk->sk_state == TCP_LISTEN) {
1842				err = -EINVAL;
1843				break;
1844			}
1845
1846			spin_lock(&sk->sk_receive_queue.lock);
1847			if (sk->sk_type == SOCK_STREAM ||
1848			    sk->sk_type == SOCK_SEQPACKET) {
1849				skb_queue_walk(&sk->sk_receive_queue, skb)
1850					amount += skb->len;
1851			} else {
1852				skb = skb_peek(&sk->sk_receive_queue);
1853				if (skb)
1854					amount=skb->len;
1855			}
1856			spin_unlock(&sk->sk_receive_queue.lock);
1857			err = put_user(amount, (int __user *)arg);
1858			break;
1859		}
1860
1861		default:
1862			err = -ENOIOCTLCMD;
1863			break;
1864	}
1865	return err;
1866}
1867
1868static unsigned int unix_poll(struct file * file, struct socket *sock, poll_table *wait)
1869{
1870	struct sock *sk = sock->sk;
1871	unsigned int mask;
1872
1873	poll_wait(file, sk->sk_sleep, wait);
1874	mask = 0;
1875
1876	/* exceptional events? */
1877	if (sk->sk_err)
1878		mask |= POLLERR;
1879	if (sk->sk_shutdown == SHUTDOWN_MASK)
1880		mask |= POLLHUP;
1881
1882	/* readable? */
1883	if (!skb_queue_empty(&sk->sk_receive_queue) ||
1884	    (sk->sk_shutdown & RCV_SHUTDOWN))
1885		mask |= POLLIN | POLLRDNORM;
1886
1887	/* Connection-based need to check for termination and startup */
1888	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) && sk->sk_state == TCP_CLOSE)
1889		mask |= POLLHUP;
1890
1891	/*
1892	 * we set writable also when the other side has shut down the
1893	 * connection. This prevents stuck sockets.
1894	 */
1895	if (unix_writable(sk))
1896		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
1897
1898	return mask;
1899}
1900
1901
1902#ifdef CONFIG_PROC_FS
1903static struct sock *unix_seq_idx(int *iter, loff_t pos)
1904{
1905	loff_t off = 0;
1906	struct sock *s;
1907
1908	for (s = first_unix_socket(iter); s; s = next_unix_socket(iter, s)) {
1909		if (off == pos)
1910			return s;
1911		++off;
1912	}
1913	return NULL;
1914}
1915
1916
1917static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
1918{
1919	spin_lock(&unix_table_lock);
1920	return *pos ? unix_seq_idx(seq->private, *pos - 1) : ((void *) 1);
1921}
1922
1923static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1924{
1925	++*pos;
1926
1927	if (v == (void *)1)
1928		return first_unix_socket(seq->private);
1929	return next_unix_socket(seq->private, v);
1930}
1931
1932static void unix_seq_stop(struct seq_file *seq, void *v)
1933{
1934	spin_unlock(&unix_table_lock);
1935}
1936
1937static int unix_seq_show(struct seq_file *seq, void *v)
1938{
1939
1940	if (v == (void *)1)
1941		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
1942			 "Inode Path\n");
1943	else {
1944		struct sock *s = v;
1945		struct unix_sock *u = unix_sk(s);
1946		unix_state_rlock(s);
1947
1948		seq_printf(seq, "%p: %08X %08X %08X %04X %02X %5lu",
1949			s,
1950			atomic_read(&s->sk_refcnt),
1951			0,
1952			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
1953			s->sk_type,
1954			s->sk_socket ?
1955			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
1956			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
1957			sock_i_ino(s));
1958
1959		if (u->addr) {
1960			int i, len;
1961			seq_putc(seq, ' ');
1962
1963			i = 0;
1964			len = u->addr->len - sizeof(short);
1965			if (!UNIX_ABSTRACT(s))
1966				len--;
1967			else {
1968				seq_putc(seq, '@');
1969				i++;
1970			}
1971			for ( ; i < len; i++)
1972				seq_putc(seq, u->addr->name->sun_path[i]);
1973		}
1974		unix_state_runlock(s);
1975		seq_putc(seq, '\n');
1976	}
1977
1978	return 0;
1979}
1980
1981static struct seq_operations unix_seq_ops = {
1982	.start  = unix_seq_start,
1983	.next   = unix_seq_next,
1984	.stop   = unix_seq_stop,
1985	.show   = unix_seq_show,
1986};
1987
1988
1989static int unix_seq_open(struct inode *inode, struct file *file)
1990{
1991	struct seq_file *seq;
1992	int rc = -ENOMEM;
1993	int *iter = kmalloc(sizeof(int), GFP_KERNEL);
1994
1995	if (!iter)
1996		goto out;
1997
1998	rc = seq_open(file, &unix_seq_ops);
1999	if (rc)
2000		goto out_kfree;
2001
2002	seq	     = file->private_data;
2003	seq->private = iter;
2004	*iter = 0;
2005out:
2006	return rc;
2007out_kfree:
2008	kfree(iter);
2009	goto out;
2010}
2011
2012static struct file_operations unix_seq_fops = {
2013	.owner		= THIS_MODULE,
2014	.open		= unix_seq_open,
2015	.read		= seq_read,
2016	.llseek		= seq_lseek,
2017	.release	= seq_release_private,
2018};
2019
2020#endif
2021
2022static struct net_proto_family unix_family_ops = {
2023	.family = PF_UNIX,
2024	.create = unix_create,
2025	.owner	= THIS_MODULE,
2026};
2027
2028static int __init af_unix_init(void)
2029{
2030	int rc = -1;
2031	struct sk_buff *dummy_skb;
2032
2033	if (sizeof(struct unix_skb_parms) > sizeof(dummy_skb->cb)) {
2034		printk(KERN_CRIT "%s: panic\n", __FUNCTION__);
2035		goto out;
2036	}
2037
2038	rc = proto_register(&unix_proto, 1);
2039        if (rc != 0) {
2040                printk(KERN_CRIT "%s: Cannot create unix_sock SLAB cache!\n",
2041		       __FUNCTION__);
2042		goto out;
2043	}
2044
2045	sock_register(&unix_family_ops);
2046#ifdef CONFIG_PROC_FS
2047	proc_net_fops_create("unix", 0, &unix_seq_fops);
2048#endif
2049	unix_sysctl_register();
2050out:
2051	return rc;
2052}
2053
2054static void __exit af_unix_exit(void)
2055{
2056	sock_unregister(PF_UNIX);
2057	unix_sysctl_unregister();
2058	proc_net_remove("unix");
2059	proto_unregister(&unix_proto);
2060}
2061
/* Module entry and exit points. */
module_init(af_unix_init);
module_exit(af_unix_exit);

/* Module metadata: licence and the protocol-family alias for PF_UNIX. */
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_UNIX);
2067