af_unix.c revision f3d3342602f8bcbf37d7c46641cb9bca7618eb1c
1/*
2 * NET4:	Implementation of BSD Unix domain sockets.
3 *
4 * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
5 *
6 *		This program is free software; you can redistribute it and/or
7 *		modify it under the terms of the GNU General Public License
8 *		as published by the Free Software Foundation; either version
9 *		2 of the License, or (at your option) any later version.
10 *
11 * Fixes:
12 *		Linus Torvalds	:	Assorted bug cures.
13 *		Niibe Yutaka	:	async I/O support.
14 *		Carsten Paeth	:	PF_UNIX check, address fixes.
15 *		Alan Cox	:	Limit size of allocated blocks.
16 *		Alan Cox	:	Fixed the stupid socketpair bug.
17 *		Alan Cox	:	BSD compatibility fine tuning.
18 *		Alan Cox	:	Fixed a bug in connect when interrupted.
19 *		Alan Cox	:	Sorted out a proper draft version of
20 *					file descriptor passing hacked up from
21 *					Mike Shaver's work.
22 *		Marty Leisner	:	Fixes to fd passing
23 *		Nick Nevin	:	recvmsg bugfix.
24 *		Alan Cox	:	Started proper garbage collector
25 *		Heiko Eißfeldt	:	Missing verify_area check
26 *		Alan Cox	:	Started POSIXisms
27 *		Andreas Schwab	:	Replace inode by dentry for proper
28 *					reference counting
29 *		Kirk Petersen	:	Made this a module
30 *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
31 *					Lots of bug fixes.
32 *	     Alexey Kuznetsov	:	Repaired (I hope) bugs introduced
33 *					by the above two patches.
34 *	     Andrea Arcangeli	:	If possible we block in connect(2)
35 *					if the max backlog of the listen socket
36 *					has been reached. This won't break
37 *					old apps and it avoids a huge amount
38 *					of hashed socks (for unix_gc()
39 *					performance reasons).
40 *					Security fix that limits the max
41 *					number of socks to 2*max_files and
42 *					the number of skb queueable in the
43 *					dgram receiver.
44 *		Artur Skawina   :	Hash function optimizations
45 *	     Alexey Kuznetsov   :	Full scale SMP. Lot of bugs are introduced 8)
46 *	      Malcolm Beattie   :	Set peercred for socketpair
47 *	     Michal Ostrowski   :       Module initialization cleanup.
48 *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
49 *	     				the core infrastructure is doing that
50 *	     				for all net proto families now (2.5.69+)
51 *
52 *
53 * Known differences from reference BSD that was tested:
54 *
55 *	[TO FIX]
56 *	ECONNREFUSED is not returned from one end of a connected socket to the
57 *		other the moment one end closes.
58 *	fstat() doesn't return st_dev=0, and gives the blksize as high water mark
59 *		and a fake inode identifier (nor does it have the BSD first-socket-fstat-twice bug).
60 *	[NOT TO FIX]
61 *	accept() returns a path name even if the connecting socket has closed
62 *		in the meantime (BSD loses the path and gives up).
63 *	accept() returns 0 length path for an unbound connector. BSD returns 16
64 *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
65 *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
66 *	BSD af_unix apparently forgets to block properly in connect.
67 *		(need to check this with the POSIX spec in detail)
68 *
69 * Differences from 2.0.0-11-... (ANK)
70 *	Bug fixes and improvements.
71 *		- client shutdown killed server socket.
72 *		- removed all useless cli/sti pairs.
73 *
74 *	Semantic changes/extensions.
75 *		- generic control message passing.
76 *		- SCM_CREDENTIALS control message.
77 *		- "Abstract" (not FS based) socket bindings.
78 *		  Abstract names are sequences of bytes (not zero terminated)
79 *		  starting with 0, so that this name space does not intersect
80 *		  with BSD names.
81 */
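
/*
 * A minimal userspace sketch (not part of this kernel file) of the
 * abstract namespace described above; the hypothetical bind_abstract()
 * binds the name "\0demo", counted by length rather than NUL-terminated:
 *
 *	#include <stddef.h>
 *	#include <string.h>
 *	#include <sys/socket.h>
 *	#include <sys/un.h>
 *
 *	int bind_abstract(int fd)
 *	{
 *		struct sockaddr_un sun;
 *		socklen_t len;
 *
 *		memset(&sun, 0, sizeof(sun));
 *		sun.sun_family = AF_UNIX;
 *		memcpy(&sun.sun_path[1], "demo", 4);
 *		len = offsetof(struct sockaddr_un, sun_path) + 1 + 4;
 *		return bind(fd, (struct sockaddr *)&sun, len);
 *	}
 */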
82
83#include <linux/module.h>
84#include <linux/kernel.h>
85#include <linux/signal.h>
86#include <linux/sched.h>
87#include <linux/errno.h>
88#include <linux/string.h>
89#include <linux/stat.h>
90#include <linux/dcache.h>
91#include <linux/namei.h>
92#include <linux/socket.h>
93#include <linux/un.h>
94#include <linux/fcntl.h>
95#include <linux/termios.h>
96#include <linux/sockios.h>
97#include <linux/net.h>
98#include <linux/in.h>
99#include <linux/fs.h>
100#include <linux/slab.h>
101#include <asm/uaccess.h>
102#include <linux/skbuff.h>
103#include <linux/netdevice.h>
104#include <net/net_namespace.h>
105#include <net/sock.h>
106#include <net/tcp_states.h>
107#include <net/af_unix.h>
108#include <linux/proc_fs.h>
109#include <linux/seq_file.h>
110#include <net/scm.h>
111#include <linux/init.h>
112#include <linux/poll.h>
113#include <linux/rtnetlink.h>
114#include <linux/mount.h>
115#include <net/checksum.h>
116#include <linux/security.h>
117#include <linux/freezer.h>
118
119struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE];
120EXPORT_SYMBOL_GPL(unix_socket_table);
121DEFINE_SPINLOCK(unix_table_lock);
122EXPORT_SYMBOL_GPL(unix_table_lock);
123static atomic_long_t unix_nr_socks;
124
125
126static struct hlist_head *unix_sockets_unbound(void *addr)
127{
128	unsigned long hash = (unsigned long)addr;
129
130	hash ^= hash >> 16;
131	hash ^= hash >> 8;
132	hash %= UNIX_HASH_SIZE;
133	return &unix_socket_table[UNIX_HASH_SIZE + hash];
134}
135
136#define UNIX_ABSTRACT(sk)	(unix_sk(sk)->addr->hash < UNIX_HASH_SIZE)
137
138#ifdef CONFIG_SECURITY_NETWORK
139static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
140{
141	memcpy(UNIXSID(skb), &scm->secid, sizeof(u32));
142}
143
144static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
145{
146	scm->secid = *UNIXSID(skb);
147}
148#else
149static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
150{ }
151
152static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
153{ }
154#endif /* CONFIG_SECURITY_NETWORK */
155
156/*
157 *  SMP locking strategy:
158 *    the hash table is protected by the spinlock unix_table_lock;
159 *    each socket's state is protected by its own spinlock.
160 */
161
162static inline unsigned int unix_hash_fold(__wsum n)
163{
164	unsigned int hash = (__force unsigned int)n;
165
166	hash ^= hash>>16;
167	hash ^= hash>>8;
168	return hash&(UNIX_HASH_SIZE-1);
169}
170
171#define unix_peer(sk) (unix_sk(sk)->peer)
172
173static inline int unix_our_peer(struct sock *sk, struct sock *osk)
174{
175	return unix_peer(osk) == sk;
176}
177
178static inline int unix_may_send(struct sock *sk, struct sock *osk)
179{
180	return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
181}
182
183static inline int unix_recvq_full(struct sock const *sk)
184{
185	return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
186}
187
188struct sock *unix_peer_get(struct sock *s)
189{
190	struct sock *peer;
191
192	unix_state_lock(s);
193	peer = unix_peer(s);
194	if (peer)
195		sock_hold(peer);
196	unix_state_unlock(s);
197	return peer;
198}
199EXPORT_SYMBOL_GPL(unix_peer_get);
200
201static inline void unix_release_addr(struct unix_address *addr)
202{
203	if (atomic_dec_and_test(&addr->refcnt))
204		kfree(addr);
205}
206
207/*
208 *	Check unix socket name:
209 *		- should not be zero length.
210 *	        - if it does not start with zero, it should be NUL terminated (FS object)
211 *		- if it starts with zero, it is an abstract name.
212 */
213
214static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned int *hashp)
215{
216	if (len <= sizeof(short) || len > sizeof(*sunaddr))
217		return -EINVAL;
218	if (!sunaddr || sunaddr->sun_family != AF_UNIX)
219		return -EINVAL;
220	if (sunaddr->sun_path[0]) {
221		/*
222		 * This may look like an off by one error but it is a bit more
223		 * subtle. 108 is the longest valid AF_UNIX path for a binding.
224		 * sun_path[108] doesn't as such exist.  However in kernel space
225		 * sun_path[108] doesn't exist as such.  However, in kernel space
226		 * kernel address buffer.
227		 */
228		((char *)sunaddr)[len] = 0;
229		len = strlen(sunaddr->sun_path)+1+sizeof(short);
230		return len;
231	}
232
233	*hashp = unix_hash_fold(csum_partial(sunaddr, len, 0));
234	return len;
235}
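
/*
 * For the filesystem form checked above, a userspace caller would
 * typically compute the address length like this (a sketch with a
 * hypothetical path; the kernel re-derives the length from strlen()
 * either way):
 *
 *	struct sockaddr_un sun = { .sun_family = AF_UNIX };
 *	socklen_t len;
 *
 *	strncpy(sun.sun_path, "/tmp/demo.sock", sizeof(sun.sun_path) - 1);
 *	len = offsetof(struct sockaddr_un, sun_path) +
 *	      strlen(sun.sun_path) + 1;
 *	bind(fd, (struct sockaddr *)&sun, len);
 */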
236
237static void __unix_remove_socket(struct sock *sk)
238{
239	sk_del_node_init(sk);
240}
241
242static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
243{
244	WARN_ON(!sk_unhashed(sk));
245	sk_add_node(sk, list);
246}
247
248static inline void unix_remove_socket(struct sock *sk)
249{
250	spin_lock(&unix_table_lock);
251	__unix_remove_socket(sk);
252	spin_unlock(&unix_table_lock);
253}
254
255static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
256{
257	spin_lock(&unix_table_lock);
258	__unix_insert_socket(list, sk);
259	spin_unlock(&unix_table_lock);
260}
261
262static struct sock *__unix_find_socket_byname(struct net *net,
263					      struct sockaddr_un *sunname,
264					      int len, int type, unsigned int hash)
265{
266	struct sock *s;
267
268	sk_for_each(s, &unix_socket_table[hash ^ type]) {
269		struct unix_sock *u = unix_sk(s);
270
271		if (!net_eq(sock_net(s), net))
272			continue;
273
274		if (u->addr->len == len &&
275		    !memcmp(u->addr->name, sunname, len))
276			goto found;
277	}
278	s = NULL;
279found:
280	return s;
281}
282
283static inline struct sock *unix_find_socket_byname(struct net *net,
284						   struct sockaddr_un *sunname,
285						   int len, int type,
286						   unsigned int hash)
287{
288	struct sock *s;
289
290	spin_lock(&unix_table_lock);
291	s = __unix_find_socket_byname(net, sunname, len, type, hash);
292	if (s)
293		sock_hold(s);
294	spin_unlock(&unix_table_lock);
295	return s;
296}
297
298static struct sock *unix_find_socket_byinode(struct inode *i)
299{
300	struct sock *s;
301
302	spin_lock(&unix_table_lock);
303	sk_for_each(s,
304		    &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
305		struct dentry *dentry = unix_sk(s)->path.dentry;
306
307		if (dentry && dentry->d_inode == i) {
308			sock_hold(s);
309			goto found;
310		}
311	}
312	s = NULL;
313found:
314	spin_unlock(&unix_table_lock);
315	return s;
316}
317
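/* A socket counts as writable while its allocated write memory stays at
 * or below a quarter of sk_sndbuf (wmem_alloc * 4 <= sndbuf).
 */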
318static inline int unix_writable(struct sock *sk)
319{
320	return (atomic_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
321}
322
323static void unix_write_space(struct sock *sk)
324{
325	struct socket_wq *wq;
326
327	rcu_read_lock();
328	if (unix_writable(sk)) {
329		wq = rcu_dereference(sk->sk_wq);
330		if (wq_has_sleeper(wq))
331			wake_up_interruptible_sync_poll(&wq->wait,
332				POLLOUT | POLLWRNORM | POLLWRBAND);
333		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
334	}
335	rcu_read_unlock();
336}
337
338/* When a dgram socket disconnects (or changes its peer), we clear its receive
339 * queue of packets that arrived from the previous peer. First, this allows us to
340 * do flow control based only on wmem_alloc; second, an sk connected to a peer
341 * may receive messages only from that peer. */
342static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
343{
344	if (!skb_queue_empty(&sk->sk_receive_queue)) {
345		skb_queue_purge(&sk->sk_receive_queue);
346		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
347
348		/* If one link of a bidirectional dgram pipe is disconnected,
349		 * we signal an error. Messages are lost. Do not do this
350		 * when the peer was not connected to us.
351		 */
352		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
353			other->sk_err = ECONNRESET;
354			other->sk_error_report(other);
355		}
356	}
357}
358
359static void unix_sock_destructor(struct sock *sk)
360{
361	struct unix_sock *u = unix_sk(sk);
362
363	skb_queue_purge(&sk->sk_receive_queue);
364
365	WARN_ON(atomic_read(&sk->sk_wmem_alloc));
366	WARN_ON(!sk_unhashed(sk));
367	WARN_ON(sk->sk_socket);
368	if (!sock_flag(sk, SOCK_DEAD)) {
369		printk(KERN_INFO "Attempt to release alive unix socket: %p\n", sk);
370		return;
371	}
372
373	if (u->addr)
374		unix_release_addr(u->addr);
375
376	atomic_long_dec(&unix_nr_socks);
377	local_bh_disable();
378	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
379	local_bh_enable();
380#ifdef UNIX_REFCNT_DEBUG
381	printk(KERN_DEBUG "UNIX %p is destroyed, %ld are still alive.\n", sk,
382		atomic_long_read(&unix_nr_socks));
383#endif
384}
385
386static void unix_release_sock(struct sock *sk, int embrion)
387{
388	struct unix_sock *u = unix_sk(sk);
389	struct path path;
390	struct sock *skpair;
391	struct sk_buff *skb;
392	int state;
393
394	unix_remove_socket(sk);
395
396	/* Clear state */
397	unix_state_lock(sk);
398	sock_orphan(sk);
399	sk->sk_shutdown = SHUTDOWN_MASK;
400	path	     = u->path;
401	u->path.dentry = NULL;
402	u->path.mnt = NULL;
403	state = sk->sk_state;
404	sk->sk_state = TCP_CLOSE;
405	unix_state_unlock(sk);
406
407	wake_up_interruptible_all(&u->peer_wait);
408
409	skpair = unix_peer(sk);
410
411	if (skpair != NULL) {
412		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
413			unix_state_lock(skpair);
414			/* No more writes */
415			skpair->sk_shutdown = SHUTDOWN_MASK;
416			if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
417				skpair->sk_err = ECONNRESET;
418			unix_state_unlock(skpair);
419			skpair->sk_state_change(skpair);
420			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
421		}
422		sock_put(skpair); /* It may now die */
423		unix_peer(sk) = NULL;
424	}
425
426	/* Try to flush out this socket. Throw out buffers at least */
427
428	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
429		if (state == TCP_LISTEN)
430			unix_release_sock(skb->sk, 1);
431		/* passed fds are erased in the kfree_skb hook	      */
432		kfree_skb(skb);
433	}
434
435	if (path.dentry)
436		path_put(&path);
437
438	sock_put(sk);
439
440	/* ---- Socket is dead now and most probably destroyed ---- */
441
442	/*
443	 * Fixme: BSD difference: In BSD all sockets connected to us get
444	 *	  ECONNRESET and we die on the spot. In Linux we behave
445	 *	  like files and pipes do and wait for the last
446	 *	  dereference.
447	 *
448	 * Can't we simply set sock->err?
449	 *
450	 *	  What does the above comment talk about? --ANK(980817)
451	 */
452
453	if (unix_tot_inflight)
454		unix_gc();		/* Garbage collect fds */
455}
456
457static void init_peercred(struct sock *sk)
458{
459	put_pid(sk->sk_peer_pid);
460	if (sk->sk_peer_cred)
461		put_cred(sk->sk_peer_cred);
462	sk->sk_peer_pid  = get_pid(task_tgid(current));
463	sk->sk_peer_cred = get_current_cred();
464}
465
466static void copy_peercred(struct sock *sk, struct sock *peersk)
467{
468	put_pid(sk->sk_peer_pid);
469	if (sk->sk_peer_cred)
470		put_cred(sk->sk_peer_cred);
471	sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
472	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
473}
474
475static int unix_listen(struct socket *sock, int backlog)
476{
477	int err;
478	struct sock *sk = sock->sk;
479	struct unix_sock *u = unix_sk(sk);
480	struct pid *old_pid = NULL;
481
482	err = -EOPNOTSUPP;
483	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
484		goto out;	/* Only stream/seqpacket sockets accept */
485	err = -EINVAL;
486	if (!u->addr)
487		goto out;	/* No listens on an unbound socket */
488	unix_state_lock(sk);
489	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
490		goto out_unlock;
491	if (backlog > sk->sk_max_ack_backlog)
492		wake_up_interruptible_all(&u->peer_wait);
493	sk->sk_max_ack_backlog	= backlog;
494	sk->sk_state		= TCP_LISTEN;
495	/* set credentials so connect can copy them */
496	init_peercred(sk);
497	err = 0;
498
499out_unlock:
500	unix_state_unlock(sk);
501	put_pid(old_pid);
502out:
503	return err;
504}
505
506static int unix_release(struct socket *);
507static int unix_bind(struct socket *, struct sockaddr *, int);
508static int unix_stream_connect(struct socket *, struct sockaddr *,
509			       int addr_len, int flags);
510static int unix_socketpair(struct socket *, struct socket *);
511static int unix_accept(struct socket *, struct socket *, int);
512static int unix_getname(struct socket *, struct sockaddr *, int *, int);
513static unsigned int unix_poll(struct file *, struct socket *, poll_table *);
514static unsigned int unix_dgram_poll(struct file *, struct socket *,
515				    poll_table *);
516static int unix_ioctl(struct socket *, unsigned int, unsigned long);
517static int unix_shutdown(struct socket *, int);
518static int unix_stream_sendmsg(struct kiocb *, struct socket *,
519			       struct msghdr *, size_t);
520static int unix_stream_recvmsg(struct kiocb *, struct socket *,
521			       struct msghdr *, size_t, int);
522static int unix_dgram_sendmsg(struct kiocb *, struct socket *,
523			      struct msghdr *, size_t);
524static int unix_dgram_recvmsg(struct kiocb *, struct socket *,
525			      struct msghdr *, size_t, int);
526static int unix_dgram_connect(struct socket *, struct sockaddr *,
527			      int, int);
528static int unix_seqpacket_sendmsg(struct kiocb *, struct socket *,
529				  struct msghdr *, size_t);
530static int unix_seqpacket_recvmsg(struct kiocb *, struct socket *,
531				  struct msghdr *, size_t, int);
532
533static void unix_set_peek_off(struct sock *sk, int val)
534{
535	struct unix_sock *u = unix_sk(sk);
536
537	mutex_lock(&u->readlock);
538	sk->sk_peek_off = val;
539	mutex_unlock(&u->readlock);
540}
541
542
543static const struct proto_ops unix_stream_ops = {
544	.family =	PF_UNIX,
545	.owner =	THIS_MODULE,
546	.release =	unix_release,
547	.bind =		unix_bind,
548	.connect =	unix_stream_connect,
549	.socketpair =	unix_socketpair,
550	.accept =	unix_accept,
551	.getname =	unix_getname,
552	.poll =		unix_poll,
553	.ioctl =	unix_ioctl,
554	.listen =	unix_listen,
555	.shutdown =	unix_shutdown,
556	.setsockopt =	sock_no_setsockopt,
557	.getsockopt =	sock_no_getsockopt,
558	.sendmsg =	unix_stream_sendmsg,
559	.recvmsg =	unix_stream_recvmsg,
560	.mmap =		sock_no_mmap,
561	.sendpage =	sock_no_sendpage,
562	.set_peek_off =	unix_set_peek_off,
563};
564
565static const struct proto_ops unix_dgram_ops = {
566	.family =	PF_UNIX,
567	.owner =	THIS_MODULE,
568	.release =	unix_release,
569	.bind =		unix_bind,
570	.connect =	unix_dgram_connect,
571	.socketpair =	unix_socketpair,
572	.accept =	sock_no_accept,
573	.getname =	unix_getname,
574	.poll =		unix_dgram_poll,
575	.ioctl =	unix_ioctl,
576	.listen =	sock_no_listen,
577	.shutdown =	unix_shutdown,
578	.setsockopt =	sock_no_setsockopt,
579	.getsockopt =	sock_no_getsockopt,
580	.sendmsg =	unix_dgram_sendmsg,
581	.recvmsg =	unix_dgram_recvmsg,
582	.mmap =		sock_no_mmap,
583	.sendpage =	sock_no_sendpage,
584	.set_peek_off =	unix_set_peek_off,
585};
586
587static const struct proto_ops unix_seqpacket_ops = {
588	.family =	PF_UNIX,
589	.owner =	THIS_MODULE,
590	.release =	unix_release,
591	.bind =		unix_bind,
592	.connect =	unix_stream_connect,
593	.socketpair =	unix_socketpair,
594	.accept =	unix_accept,
595	.getname =	unix_getname,
596	.poll =		unix_dgram_poll,
597	.ioctl =	unix_ioctl,
598	.listen =	unix_listen,
599	.shutdown =	unix_shutdown,
600	.setsockopt =	sock_no_setsockopt,
601	.getsockopt =	sock_no_getsockopt,
602	.sendmsg =	unix_seqpacket_sendmsg,
603	.recvmsg =	unix_seqpacket_recvmsg,
604	.mmap =		sock_no_mmap,
605	.sendpage =	sock_no_sendpage,
606	.set_peek_off =	unix_set_peek_off,
607};
608
609static struct proto unix_proto = {
610	.name			= "UNIX",
611	.owner			= THIS_MODULE,
612	.obj_size		= sizeof(struct unix_sock),
613};
614
615/*
616 * AF_UNIX sockets do not interact with hardware, hence they
617 * don't trigger interrupts - so it's safe for them to have
618 * bh-unsafe locking for their sk_receive_queue.lock. Split off
619 * this special lock-class by reinitializing the spinlock key:
620 */
621static struct lock_class_key af_unix_sk_receive_queue_lock_key;
622
623static struct sock *unix_create1(struct net *net, struct socket *sock)
624{
625	struct sock *sk = NULL;
626	struct unix_sock *u;
627
628	atomic_long_inc(&unix_nr_socks);
629	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files())
630		goto out;
631
632	sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto);
633	if (!sk)
634		goto out;
635
636	sock_init_data(sock, sk);
637	lockdep_set_class(&sk->sk_receive_queue.lock,
638				&af_unix_sk_receive_queue_lock_key);
639
640	sk->sk_write_space	= unix_write_space;
641	sk->sk_max_ack_backlog	= net->unx.sysctl_max_dgram_qlen;
642	sk->sk_destruct		= unix_sock_destructor;
643	u	  = unix_sk(sk);
644	u->path.dentry = NULL;
645	u->path.mnt = NULL;
646	spin_lock_init(&u->lock);
647	atomic_long_set(&u->inflight, 0);
648	INIT_LIST_HEAD(&u->link);
649	mutex_init(&u->readlock); /* single task reading lock */
650	init_waitqueue_head(&u->peer_wait);
651	unix_insert_socket(unix_sockets_unbound(sk), sk);
652out:
653	if (sk == NULL)
654		atomic_long_dec(&unix_nr_socks);
655	else {
656		local_bh_disable();
657		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
658		local_bh_enable();
659	}
660	return sk;
661}
662
663static int unix_create(struct net *net, struct socket *sock, int protocol,
664		       int kern)
665{
666	if (protocol && protocol != PF_UNIX)
667		return -EPROTONOSUPPORT;
668
669	sock->state = SS_UNCONNECTED;
670
671	switch (sock->type) {
672	case SOCK_STREAM:
673		sock->ops = &unix_stream_ops;
674		break;
675		/*
676		 *	Believe it or not, BSD has AF_UNIX, SOCK_RAW, though
677		 *	nothing uses it.
678		 */
679	case SOCK_RAW:
680		sock->type = SOCK_DGRAM;
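		/* fall through: SOCK_RAW is served by the dgram ops */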
681	case SOCK_DGRAM:
682		sock->ops = &unix_dgram_ops;
683		break;
684	case SOCK_SEQPACKET:
685		sock->ops = &unix_seqpacket_ops;
686		break;
687	default:
688		return -ESOCKTNOSUPPORT;
689	}
690
691	return unix_create1(net, sock) ? 0 : -ENOMEM;
692}
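
/*
 * A userspace consequence of the switch above (sketch, not part of this
 * file): SOCK_RAW is quietly remapped, so both of these yield a datagram
 * socket.
 *
 *	int a = socket(AF_UNIX, SOCK_DGRAM, 0);
 *	int b = socket(AF_UNIX, SOCK_RAW, 0);
 */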
693
694static int unix_release(struct socket *sock)
695{
696	struct sock *sk = sock->sk;
697
698	if (!sk)
699		return 0;
700
701	unix_release_sock(sk, 0);
702	sock->sk = NULL;
703
704	return 0;
705}
706
707static int unix_autobind(struct socket *sock)
708{
709	struct sock *sk = sock->sk;
710	struct net *net = sock_net(sk);
711	struct unix_sock *u = unix_sk(sk);
712	static u32 ordernum = 1;
713	struct unix_address *addr;
714	int err;
715	unsigned int retries = 0;
716
717	mutex_lock(&u->readlock);
718
719	err = 0;
720	if (u->addr)
721		goto out;
722
723	err = -ENOMEM;
724	addr = kzalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL);
725	if (!addr)
726		goto out;
727
728	addr->name->sun_family = AF_UNIX;
729	atomic_set(&addr->refcnt, 1);
730
731retry:
732	addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short);
733	addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0));
734
735	spin_lock(&unix_table_lock);
736	ordernum = (ordernum+1)&0xFFFFF;
737
738	if (__unix_find_socket_byname(net, addr->name, addr->len, sock->type,
739				      addr->hash)) {
740		spin_unlock(&unix_table_lock);
741		/*
742		 * __unix_find_socket_byname() may take a long time if many names
743		 * are already in use.
744		 */
745		cond_resched();
746		/* Give up if all names seem to be in use. */
747		if (retries++ == 0xFFFFF) {
748			err = -ENOSPC;
749			kfree(addr);
750			goto out;
751		}
752		goto retry;
753	}
754	addr->hash ^= sk->sk_type;
755
756	__unix_remove_socket(sk);
757	u->addr = addr;
758	__unix_insert_socket(&unix_socket_table[addr->hash], sk);
759	spin_unlock(&unix_table_lock);
760	err = 0;
761
762out:	mutex_unlock(&u->readlock);
763	return err;
764}
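
/*
 * Userspace can trigger the autobind above by binding with only the
 * address family present (a sketch, assuming the usual libc headers);
 * getsockname() then reports an abstract name of a leading NUL plus
 * five hex digits:
 *
 *	struct sockaddr_un sun = { .sun_family = AF_UNIX };
 *
 *	bind(fd, (struct sockaddr *)&sun, sizeof(sun.sun_family));
 */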
765
766static struct sock *unix_find_other(struct net *net,
767				    struct sockaddr_un *sunname, int len,
768				    int type, unsigned int hash, int *error)
769{
770	struct sock *u;
771	struct path path;
772	int err = 0;
773
774	if (sunname->sun_path[0]) {
775		struct inode *inode;
776		err = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path);
777		if (err)
778			goto fail;
779		inode = path.dentry->d_inode;
780		err = inode_permission(inode, MAY_WRITE);
781		if (err)
782			goto put_fail;
783
784		err = -ECONNREFUSED;
785		if (!S_ISSOCK(inode->i_mode))
786			goto put_fail;
787		u = unix_find_socket_byinode(inode);
788		if (!u)
789			goto put_fail;
790
791		if (u->sk_type == type)
792			touch_atime(&path);
793
794		path_put(&path);
795
796		err = -EPROTOTYPE;
797		if (u->sk_type != type) {
798			sock_put(u);
799			goto fail;
800		}
801	} else {
802		err = -ECONNREFUSED;
803		u = unix_find_socket_byname(net, sunname, len, type, hash);
804		if (u) {
805			struct dentry *dentry;
806			dentry = unix_sk(u)->path.dentry;
807			if (dentry)
808				touch_atime(&unix_sk(u)->path);
809		} else
810			goto fail;
811	}
812	return u;
813
814put_fail:
815	path_put(&path);
816fail:
817	*error = err;
818	return NULL;
819}
820
821static int unix_mknod(const char *sun_path, umode_t mode, struct path *res)
822{
823	struct dentry *dentry;
824	struct path path;
825	int err = 0;
826	/*
827	 * Get the parent directory, calculate the hash for last
828	 * component.
829	 */
830	dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0);
831	err = PTR_ERR(dentry);
832	if (IS_ERR(dentry))
833		return err;
834
835	/*
836	 * All right, let's create it.
837	 */
838	err = security_path_mknod(&path, dentry, mode, 0);
839	if (!err) {
840		err = vfs_mknod(path.dentry->d_inode, dentry, mode, 0);
841		if (!err) {
842			res->mnt = mntget(path.mnt);
843			res->dentry = dget(dentry);
844		}
845	}
846	done_path_create(&path, dentry);
847	return err;
848}
849
850static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
851{
852	struct sock *sk = sock->sk;
853	struct net *net = sock_net(sk);
854	struct unix_sock *u = unix_sk(sk);
855	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
856	char *sun_path = sunaddr->sun_path;
857	int err;
858	unsigned int hash;
859	struct unix_address *addr;
860	struct hlist_head *list;
861
862	err = -EINVAL;
863	if (sunaddr->sun_family != AF_UNIX)
864		goto out;
865
866	if (addr_len == sizeof(short)) {
867		err = unix_autobind(sock);
868		goto out;
869	}
870
871	err = unix_mkname(sunaddr, addr_len, &hash);
872	if (err < 0)
873		goto out;
874	addr_len = err;
875
876	mutex_lock(&u->readlock);
877
878	err = -EINVAL;
879	if (u->addr)
880		goto out_up;
881
882	err = -ENOMEM;
883	addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
884	if (!addr)
885		goto out_up;
886
887	memcpy(addr->name, sunaddr, addr_len);
888	addr->len = addr_len;
889	addr->hash = hash ^ sk->sk_type;
890	atomic_set(&addr->refcnt, 1);
891
892	if (sun_path[0]) {
893		struct path path;
894		umode_t mode = S_IFSOCK |
895		       (SOCK_INODE(sock)->i_mode & ~current_umask());
896		err = unix_mknod(sun_path, mode, &path);
897		if (err) {
898			if (err == -EEXIST)
899				err = -EADDRINUSE;
900			unix_release_addr(addr);
901			goto out_up;
902		}
903		addr->hash = UNIX_HASH_SIZE;
904		hash = path.dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1);
905		spin_lock(&unix_table_lock);
906		u->path = path;
907		list = &unix_socket_table[hash];
908	} else {
909		spin_lock(&unix_table_lock);
910		err = -EADDRINUSE;
911		if (__unix_find_socket_byname(net, sunaddr, addr_len,
912					      sk->sk_type, hash)) {
913			unix_release_addr(addr);
914			goto out_unlock;
915		}
916
917		list = &unix_socket_table[addr->hash];
918	}
919
920	err = 0;
921	__unix_remove_socket(sk);
922	u->addr = addr;
923	__unix_insert_socket(list, sk);
924
925out_unlock:
926	spin_unlock(&unix_table_lock);
927out_up:
928	mutex_unlock(&u->readlock);
929out:
930	return err;
931}
932
933static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
934{
935	if (unlikely(sk1 == sk2) || !sk2) {
936		unix_state_lock(sk1);
937		return;
938	}
939	if (sk1 < sk2) {
940		unix_state_lock(sk1);
941		unix_state_lock_nested(sk2);
942	} else {
943		unix_state_lock(sk2);
944		unix_state_lock_nested(sk1);
945	}
946}
947
948static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
949{
950	if (unlikely(sk1 == sk2) || !sk2) {
951		unix_state_unlock(sk1);
952		return;
953	}
954	unix_state_unlock(sk1);
955	unix_state_unlock(sk2);
956}
957
958static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
959			      int alen, int flags)
960{
961	struct sock *sk = sock->sk;
962	struct net *net = sock_net(sk);
963	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
964	struct sock *other;
965	unsigned int hash;
966	int err;
967
968	if (addr->sa_family != AF_UNSPEC) {
969		err = unix_mkname(sunaddr, alen, &hash);
970		if (err < 0)
971			goto out;
972		alen = err;
973
974		if (test_bit(SOCK_PASSCRED, &sock->flags) &&
975		    !unix_sk(sk)->addr && (err = unix_autobind(sock)) != 0)
976			goto out;
977
978restart:
979		other = unix_find_other(net, sunaddr, alen, sock->type, hash, &err);
980		if (!other)
981			goto out;
982
983		unix_state_double_lock(sk, other);
984
985		/* Apparently VFS overslept socket death. Retry. */
986		if (sock_flag(other, SOCK_DEAD)) {
987			unix_state_double_unlock(sk, other);
988			sock_put(other);
989			goto restart;
990		}
991
992		err = -EPERM;
993		if (!unix_may_send(sk, other))
994			goto out_unlock;
995
996		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
997		if (err)
998			goto out_unlock;
999
1000	} else {
1001		/*
1002		 *	1003.1g breaking connected state with AF_UNSPEC
1003		 */
1004		other = NULL;
1005		unix_state_double_lock(sk, other);
1006	}
1007
1008	/*
1009	 * If it was connected, reconnect.
1010	 */
1011	if (unix_peer(sk)) {
1012		struct sock *old_peer = unix_peer(sk);
1013		unix_peer(sk) = other;
1014		unix_state_double_unlock(sk, other);
1015
1016		if (other != old_peer)
1017			unix_dgram_disconnected(sk, old_peer);
1018		sock_put(old_peer);
1019	} else {
1020		unix_peer(sk) = other;
1021		unix_state_double_unlock(sk, other);
1022	}
1023	return 0;
1024
1025out_unlock:
1026	unix_state_double_unlock(sk, other);
1027	sock_put(other);
1028out:
1029	return err;
1030}
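
/*
 * The AF_UNSPEC branch above is reachable from userspace like this
 * (sketch): connecting a datagram socket to an AF_UNSPEC address drops
 * the current peer without installing a new one.
 *
 *	struct sockaddr_un sun;
 *
 *	memset(&sun, 0, sizeof(sun));
 *	sun.sun_family = AF_UNSPEC;
 *	connect(fd, (struct sockaddr *)&sun, sizeof(sun));
 */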
1031
1032static long unix_wait_for_peer(struct sock *other, long timeo)
1033{
1034	struct unix_sock *u = unix_sk(other);
1035	int sched;
1036	DEFINE_WAIT(wait);
1037
1038	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1039
1040	sched = !sock_flag(other, SOCK_DEAD) &&
1041		!(other->sk_shutdown & RCV_SHUTDOWN) &&
1042		unix_recvq_full(other);
1043
1044	unix_state_unlock(other);
1045
1046	if (sched)
1047		timeo = schedule_timeout(timeo);
1048
1049	finish_wait(&u->peer_wait, &wait);
1050	return timeo;
1051}
1052
1053static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1054			       int addr_len, int flags)
1055{
1056	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1057	struct sock *sk = sock->sk;
1058	struct net *net = sock_net(sk);
1059	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1060	struct sock *newsk = NULL;
1061	struct sock *other = NULL;
1062	struct sk_buff *skb = NULL;
1063	unsigned int hash;
1064	int st;
1065	int err;
1066	long timeo;
1067
1068	err = unix_mkname(sunaddr, addr_len, &hash);
1069	if (err < 0)
1070		goto out;
1071	addr_len = err;
1072
1073	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr &&
1074	    (err = unix_autobind(sock)) != 0)
1075		goto out;
1076
1077	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
1078
1079	/* First of all allocate resources.
1080	   If we do this after the state is locked,
1081	   we will have to recheck everything again in any case.
1082	 */
1083
1084	err = -ENOMEM;
1085
1086	/* create new sock for complete connection */
1087	newsk = unix_create1(sock_net(sk), NULL);
1088	if (newsk == NULL)
1089		goto out;
1090
1091	/* Allocate skb for sending to listening sock */
1092	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
1093	if (skb == NULL)
1094		goto out;
1095
1096restart:
1097	/*  Find listening sock. */
1098	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, hash, &err);
1099	if (!other)
1100		goto out;
1101
1102	/* Latch state of peer */
1103	unix_state_lock(other);
1104
1105	/* Apparently VFS overslept socket death. Retry. */
1106	if (sock_flag(other, SOCK_DEAD)) {
1107		unix_state_unlock(other);
1108		sock_put(other);
1109		goto restart;
1110	}
1111
1112	err = -ECONNREFUSED;
1113	if (other->sk_state != TCP_LISTEN)
1114		goto out_unlock;
1115	if (other->sk_shutdown & RCV_SHUTDOWN)
1116		goto out_unlock;
1117
1118	if (unix_recvq_full(other)) {
1119		err = -EAGAIN;
1120		if (!timeo)
1121			goto out_unlock;
1122
1123		timeo = unix_wait_for_peer(other, timeo);
1124
1125		err = sock_intr_errno(timeo);
1126		if (signal_pending(current))
1127			goto out;
1128		sock_put(other);
1129		goto restart;
1130	}
1131
1132	/* Latch our state.
1133
1134	   It is a tricky place. We need to grab our state lock and cannot
1135	   drop the lock on the peer. It is dangerous because deadlock is
1136	   possible. The connect-to-self case and a simultaneous
1137	   attempt to connect are eliminated by checking socket
1138	   state. other is TCP_LISTEN; if sk is TCP_LISTEN we
1139	   check this before attempting to grab the lock.
1140
1141	   Well, and we have to recheck the state after the socket is locked.
1142	 */
1143	st = sk->sk_state;
1144
1145	switch (st) {
1146	case TCP_CLOSE:
1147		/* This is ok... continue with connect */
1148		break;
1149	case TCP_ESTABLISHED:
1150		/* Socket is already connected */
1151		err = -EISCONN;
1152		goto out_unlock;
1153	default:
1154		err = -EINVAL;
1155		goto out_unlock;
1156	}
1157
1158	unix_state_lock_nested(sk);
1159
1160	if (sk->sk_state != st) {
1161		unix_state_unlock(sk);
1162		unix_state_unlock(other);
1163		sock_put(other);
1164		goto restart;
1165	}
1166
1167	err = security_unix_stream_connect(sk, other, newsk);
1168	if (err) {
1169		unix_state_unlock(sk);
1170		goto out_unlock;
1171	}
1172
1173	/* The way is open! Quickly set all the necessary fields... */
1174
1175	sock_hold(sk);
1176	unix_peer(newsk)	= sk;
1177	newsk->sk_state		= TCP_ESTABLISHED;
1178	newsk->sk_type		= sk->sk_type;
1179	init_peercred(newsk);
1180	newu = unix_sk(newsk);
1181	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1182	otheru = unix_sk(other);
1183
1184	/* copy address information from listening to new sock*/
1185	if (otheru->addr) {
1186		atomic_inc(&otheru->addr->refcnt);
1187		newu->addr = otheru->addr;
1188	}
1189	if (otheru->path.dentry) {
1190		path_get(&otheru->path);
1191		newu->path = otheru->path;
1192	}
1193
1194	/* Set credentials */
1195	copy_peercred(sk, other);
1196
1197	sock->state	= SS_CONNECTED;
1198	sk->sk_state	= TCP_ESTABLISHED;
1199	sock_hold(newsk);
1200
1201	smp_mb__after_atomic_inc();	/* sock_hold() does an atomic_inc() */
1202	unix_peer(sk)	= newsk;
1203
1204	unix_state_unlock(sk);
1205
1206	/* take ten and send info to the listening sock */
1207	spin_lock(&other->sk_receive_queue.lock);
1208	__skb_queue_tail(&other->sk_receive_queue, skb);
1209	spin_unlock(&other->sk_receive_queue.lock);
1210	unix_state_unlock(other);
1211	other->sk_data_ready(other, 0);
1212	sock_put(other);
1213	return 0;
1214
1215out_unlock:
1216	if (other)
1217		unix_state_unlock(other);
1218
1219out:
1220	kfree_skb(skb);
1221	if (newsk)
1222		unix_release_sock(newsk, 0);
1223	if (other)
1224		sock_put(other);
1225	return err;
1226}
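
/*
 * Note for userspace (a sketch, not a guarantee of the API): when the
 * listener's queue is full, a blocking connect() sleeps in
 * unix_wait_for_peer(), while a non-blocking socket fails with EAGAIN
 * (not EINPROGRESS as TCP would).
 *
 *	fcntl(fd, F_SETFL, O_NONBLOCK);
 *	if (connect(fd, (struct sockaddr *)&sun, len) < 0 &&
 *	    errno == EAGAIN)
 *		retry_later(fd);
 *
 * retry_later() is a hypothetical application callback.
 */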
1227
1228static int unix_socketpair(struct socket *socka, struct socket *sockb)
1229{
1230	struct sock *ska = socka->sk, *skb = sockb->sk;
1231
1232	/* Join our sockets back to back */
1233	sock_hold(ska);
1234	sock_hold(skb);
1235	unix_peer(ska) = skb;
1236	unix_peer(skb) = ska;
1237	init_peercred(ska);
1238	init_peercred(skb);
1239
1240	if (ska->sk_type != SOCK_DGRAM) {
1241		ska->sk_state = TCP_ESTABLISHED;
1242		skb->sk_state = TCP_ESTABLISHED;
1243		socka->state  = SS_CONNECTED;
1244		sockb->state  = SS_CONNECTED;
1245	}
1246	return 0;
1247}
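
/*
 * Typical userspace use of the back-to-back pairing above (sketch):
 *
 *	int sv[2];
 *	char buf[4];
 *
 *	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) == 0) {
 *		write(sv[0], "ping", 4);
 *		read(sv[1], buf, sizeof(buf));
 *	}
 */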
1248
1249static void unix_sock_inherit_flags(const struct socket *old,
1250				    struct socket *new)
1251{
1252	if (test_bit(SOCK_PASSCRED, &old->flags))
1253		set_bit(SOCK_PASSCRED, &new->flags);
1254	if (test_bit(SOCK_PASSSEC, &old->flags))
1255		set_bit(SOCK_PASSSEC, &new->flags);
1256}
1257
1258static int unix_accept(struct socket *sock, struct socket *newsock, int flags)
1259{
1260	struct sock *sk = sock->sk;
1261	struct sock *tsk;
1262	struct sk_buff *skb;
1263	int err;
1264
1265	err = -EOPNOTSUPP;
1266	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1267		goto out;
1268
1269	err = -EINVAL;
1270	if (sk->sk_state != TCP_LISTEN)
1271		goto out;
1272
1273	/* If socket state is TCP_LISTEN it cannot change (for now...),
1274	 * so no locks are necessary.
1275	 */
1276
1277	skb = skb_recv_datagram(sk, 0, flags&O_NONBLOCK, &err);
1278	if (!skb) {
1279		/* This means receive shutdown. */
1280		if (err == 0)
1281			err = -EINVAL;
1282		goto out;
1283	}
1284
1285	tsk = skb->sk;
1286	skb_free_datagram(sk, skb);
1287	wake_up_interruptible(&unix_sk(sk)->peer_wait);
1288
1289	/* attach accepted sock to socket */
1290	unix_state_lock(tsk);
1291	newsock->state = SS_CONNECTED;
1292	unix_sock_inherit_flags(sock, newsock);
1293	sock_graft(tsk, newsock);
1294	unix_state_unlock(tsk);
1295	return 0;
1296
1297out:
1298	return err;
1299}
1300
1301
1302static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_len, int peer)
1303{
1304	struct sock *sk = sock->sk;
1305	struct unix_sock *u;
1306	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1307	int err = 0;
1308
1309	if (peer) {
1310		sk = unix_peer_get(sk);
1311
1312		err = -ENOTCONN;
1313		if (!sk)
1314			goto out;
1315		err = 0;
1316	} else {
1317		sock_hold(sk);
1318	}
1319
1320	u = unix_sk(sk);
1321	unix_state_lock(sk);
1322	if (!u->addr) {
1323		sunaddr->sun_family = AF_UNIX;
1324		sunaddr->sun_path[0] = 0;
1325		*uaddr_len = sizeof(short);
1326	} else {
1327		struct unix_address *addr = u->addr;
1328
1329		*uaddr_len = addr->len;
1330		memcpy(sunaddr, addr->name, *uaddr_len);
1331	}
1332	unix_state_unlock(sk);
1333	sock_put(sk);
1334out:
1335	return err;
1336}
1337
1338static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1339{
1340	int i;
1341
1342	scm->fp = UNIXCB(skb).fp;
1343	UNIXCB(skb).fp = NULL;
1344
1345	for (i = scm->fp->count-1; i >= 0; i--)
1346		unix_notinflight(scm->fp->fp[i]);
1347}
1348
1349static void unix_destruct_scm(struct sk_buff *skb)
1350{
1351	struct scm_cookie scm;
1352	memset(&scm, 0, sizeof(scm));
1353	scm.pid  = UNIXCB(skb).pid;
1354	if (UNIXCB(skb).fp)
1355		unix_detach_fds(&scm, skb);
1356
1357	/* Alas, it calls VFS */
1358	/* So fscking what? fput() had been SMP-safe since the last Summer */
1359	scm_destroy(&scm);
1360	sock_wfree(skb);
1361}
1362
1363#define MAX_RECURSION_LEVEL 4
1364
1365static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1366{
1367	int i;
1368	unsigned char max_level = 0;
1369	int unix_sock_count = 0;
1370
1371	for (i = scm->fp->count - 1; i >= 0; i--) {
1372		struct sock *sk = unix_get_socket(scm->fp->fp[i]);
1373
1374		if (sk) {
1375			unix_sock_count++;
1376			max_level = max(max_level,
1377					unix_sk(sk)->recursion_level);
1378		}
1379	}
1380	if (unlikely(max_level > MAX_RECURSION_LEVEL))
1381		return -ETOOMANYREFS;
1382
1383	/*
1384	 * Need to duplicate file references for the sake of garbage
1385	 * collection.  Otherwise a socket in the fps might become a
1386	 * candidate for GC while the skb is not yet queued.
1387	 */
1388	UNIXCB(skb).fp = scm_fp_dup(scm->fp);
1389	if (!UNIXCB(skb).fp)
1390		return -ENOMEM;
1391
1392	if (unix_sock_count) {
1393		for (i = scm->fp->count - 1; i >= 0; i--)
1394			unix_inflight(scm->fp->fp[i]);
1395	}
1396	return max_level;
1397}
1398
1399static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1400{
1401	int err = 0;
1402
1403	UNIXCB(skb).pid  = get_pid(scm->pid);
1404	UNIXCB(skb).uid = scm->creds.uid;
1405	UNIXCB(skb).gid = scm->creds.gid;
1406	UNIXCB(skb).fp = NULL;
1407	if (scm->fp && send_fds)
1408		err = unix_attach_fds(scm, skb);
1409
1410	skb->destructor = unix_destruct_scm;
1411	return err;
1412}
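
/*
 * Userspace counterpart of the fd attachment above (a sketch using the
 * standard CMSG_* macros; sock_fd and fd_to_pass are placeholders):
 *
 *	char cbuf[CMSG_SPACE(sizeof(int))];
 *	struct iovec iov = { .iov_base = "x", .iov_len = 1 };
 *	struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1,
 *			      .msg_control = cbuf,
 *			      .msg_controllen = sizeof(cbuf) };
 *	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
 *
 *	cmsg->cmsg_level = SOL_SOCKET;
 *	cmsg->cmsg_type	 = SCM_RIGHTS;
 *	cmsg->cmsg_len	 = CMSG_LEN(sizeof(int));
 *	memcpy(CMSG_DATA(cmsg), &fd_to_pass, sizeof(int));
 *	sendmsg(sock_fd, &msg, 0);
 */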
1413
1414/*
1415 * Some apps rely on write() giving SCM_CREDENTIALS
1416 * We include credentials if source or destination socket
1417 * asserted SOCK_PASSCRED.
1418 */
1419static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1420			    const struct sock *other)
1421{
1422	if (UNIXCB(skb).pid)
1423		return;
1424	if (test_bit(SOCK_PASSCRED, &sock->flags) ||
1425	    !other->sk_socket ||
1426	    test_bit(SOCK_PASSCRED, &other->sk_socket->flags)) {
1427		UNIXCB(skb).pid  = get_pid(task_tgid(current));
1428		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1429	}
1430}
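
/*
 * Receiving-side sketch for the credentials attached above: enable
 * SO_PASSCRED, then pull SCM_CREDENTIALS out of the ancillary data.
 *
 *	int on = 1;
 *
 *	setsockopt(fd, SOL_SOCKET, SO_PASSCRED, &on, sizeof(on));
 *
 * After recvmsg(), walk CMSG_FIRSTHDR()/CMSG_NXTHDR() for a cmsg_type
 * of SCM_CREDENTIALS; CMSG_DATA() then points at a struct ucred holding
 * pid, uid and gid.
 */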
1431
1432/*
1433 *	Send AF_UNIX data.
1434 */
1435
1436static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock,
1437			      struct msghdr *msg, size_t len)
1438{
1439	struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
1440	struct sock *sk = sock->sk;
1441	struct net *net = sock_net(sk);
1442	struct unix_sock *u = unix_sk(sk);
1443	struct sockaddr_un *sunaddr = msg->msg_name;
1444	struct sock *other = NULL;
1445	int namelen = 0; /* fake GCC */
1446	int err;
1447	unsigned int hash;
1448	struct sk_buff *skb;
1449	long timeo;
1450	struct scm_cookie tmp_scm;
1451	int max_level;
1452	int data_len = 0;
1453
1454	if (NULL == siocb->scm)
1455		siocb->scm = &tmp_scm;
1456	wait_for_unix_gc();
1457	err = scm_send(sock, msg, siocb->scm, false);
1458	if (err < 0)
1459		return err;
1460
1461	err = -EOPNOTSUPP;
1462	if (msg->msg_flags&MSG_OOB)
1463		goto out;
1464
1465	if (msg->msg_namelen) {
1466		err = unix_mkname(sunaddr, msg->msg_namelen, &hash);
1467		if (err < 0)
1468			goto out;
1469		namelen = err;
1470	} else {
1471		sunaddr = NULL;
1472		err = -ENOTCONN;
1473		other = unix_peer_get(sk);
1474		if (!other)
1475			goto out;
1476	}
1477
1478	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr
1479	    && (err = unix_autobind(sock)) != 0)
1480		goto out;
1481
1482	err = -EMSGSIZE;
1483	if (len > sk->sk_sndbuf - 32)
1484		goto out;
1485
1486	if (len > SKB_MAX_ALLOC)
1487		data_len = min_t(size_t,
1488				 len - SKB_MAX_ALLOC,
1489				 MAX_SKB_FRAGS * PAGE_SIZE);
1490
1491	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
1492				   msg->msg_flags & MSG_DONTWAIT, &err,
1493				   PAGE_ALLOC_COSTLY_ORDER);
1494	if (skb == NULL)
1495		goto out;
1496
1497	err = unix_scm_to_skb(siocb->scm, skb, true);
1498	if (err < 0)
1499		goto out_free;
1500	max_level = err + 1;
1501	unix_get_secdata(siocb->scm, skb);
1502
1503	skb_put(skb, len - data_len);
1504	skb->data_len = data_len;
1505	skb->len = len;
1506	err = skb_copy_datagram_from_iovec(skb, 0, msg->msg_iov, 0, len);
1507	if (err)
1508		goto out_free;
1509
1510	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1511
1512restart:
1513	if (!other) {
1514		err = -ECONNRESET;
1515		if (sunaddr == NULL)
1516			goto out_free;
1517
1518		other = unix_find_other(net, sunaddr, namelen, sk->sk_type,
1519					hash, &err);
1520		if (other == NULL)
1521			goto out_free;
1522	}
1523
1524	if (sk_filter(other, skb) < 0) {
1525		/* Toss the packet but do not return any error to the sender */
1526		err = len;
1527		goto out_free;
1528	}
1529
1530	unix_state_lock(other);
1531	err = -EPERM;
1532	if (!unix_may_send(sk, other))
1533		goto out_unlock;
1534
1535	if (sock_flag(other, SOCK_DEAD)) {
1536		/*
1537		 *	Check with 1003.1g - what should
1538		 *	datagram error
1539		 */
1540		unix_state_unlock(other);
1541		sock_put(other);
1542
1543		err = 0;
1544		unix_state_lock(sk);
1545		if (unix_peer(sk) == other) {
1546			unix_peer(sk) = NULL;
1547			unix_state_unlock(sk);
1548
1549			unix_dgram_disconnected(sk, other);
1550			sock_put(other);
1551			err = -ECONNREFUSED;
1552		} else {
1553			unix_state_unlock(sk);
1554		}
1555
1556		other = NULL;
1557		if (err)
1558			goto out_free;
1559		goto restart;
1560	}
1561
1562	err = -EPIPE;
1563	if (other->sk_shutdown & RCV_SHUTDOWN)
1564		goto out_unlock;
1565
1566	if (sk->sk_type != SOCK_SEQPACKET) {
1567		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1568		if (err)
1569			goto out_unlock;
1570	}
1571
1572	if (unix_peer(other) != sk && unix_recvq_full(other)) {
1573		if (!timeo) {
1574			err = -EAGAIN;
1575			goto out_unlock;
1576		}
1577
1578		timeo = unix_wait_for_peer(other, timeo);
1579
1580		err = sock_intr_errno(timeo);
1581		if (signal_pending(current))
1582			goto out_free;
1583
1584		goto restart;
1585	}
1586
1587	if (sock_flag(other, SOCK_RCVTSTAMP))
1588		__net_timestamp(skb);
1589	maybe_add_creds(skb, sock, other);
1590	skb_queue_tail(&other->sk_receive_queue, skb);
1591	if (max_level > unix_sk(other)->recursion_level)
1592		unix_sk(other)->recursion_level = max_level;
1593	unix_state_unlock(other);
1594	other->sk_data_ready(other, len);
1595	sock_put(other);
1596	scm_destroy(siocb->scm);
1597	return len;
1598
1599out_unlock:
1600	unix_state_unlock(other);
1601out_free:
1602	kfree_skb(skb);
1603out:
1604	if (other)
1605		sock_put(other);
1606	scm_destroy(siocb->scm);
1607	return err;
1608}
1609
1610/* We use paged skbs for stream sockets, and limit occupancy to 32768
1611 * bytes, and a minimum of a full page.
1612 */
1613#define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
1614
1615static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
1616			       struct msghdr *msg, size_t len)
1617{
1618	struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
1619	struct sock *sk = sock->sk;
1620	struct sock *other = NULL;
1621	int err, size;
1622	struct sk_buff *skb;
1623	int sent = 0;
1624	struct scm_cookie tmp_scm;
1625	bool fds_sent = false;
1626	int max_level;
1627	int data_len;
1628
1629	if (NULL == siocb->scm)
1630		siocb->scm = &tmp_scm;
1631	wait_for_unix_gc();
1632	err = scm_send(sock, msg, siocb->scm, false);
1633	if (err < 0)
1634		return err;
1635
1636	err = -EOPNOTSUPP;
1637	if (msg->msg_flags&MSG_OOB)
1638		goto out_err;
1639
1640	if (msg->msg_namelen) {
1641		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
1642		goto out_err;
1643	} else {
1644		err = -ENOTCONN;
1645		other = unix_peer(sk);
1646		if (!other)
1647			goto out_err;
1648	}
1649
1650	if (sk->sk_shutdown & SEND_SHUTDOWN)
1651		goto pipe_err;
1652
1653	while (sent < len) {
1654		size = len - sent;
1655
1656		/* Keep two messages in the pipe so it schedules better */
1657		size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);
1658
1659		/* allow fallback to order-0 allocations */
1660		size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
1661
1662		data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
1663
1664		skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
1665					   msg->msg_flags & MSG_DONTWAIT, &err,
1666					   get_order(UNIX_SKB_FRAGS_SZ));
1667		if (!skb)
1668			goto out_err;
1669
1670		/* Only send the fds in the first buffer */
1671		err = unix_scm_to_skb(siocb->scm, skb, !fds_sent);
1672		if (err < 0) {
1673			kfree_skb(skb);
1674			goto out_err;
1675		}
1676		max_level = err + 1;
1677		fds_sent = true;
1678
1679		skb_put(skb, size - data_len);
1680		skb->data_len = data_len;
1681		skb->len = size;
1682		err = skb_copy_datagram_from_iovec(skb, 0, msg->msg_iov,
1683						   sent, size);
1684		if (err) {
1685			kfree_skb(skb);
1686			goto out_err;
1687		}
1688
1689		unix_state_lock(other);
1690
1691		if (sock_flag(other, SOCK_DEAD) ||
1692		    (other->sk_shutdown & RCV_SHUTDOWN))
1693			goto pipe_err_free;
1694
1695		maybe_add_creds(skb, sock, other);
1696		skb_queue_tail(&other->sk_receive_queue, skb);
1697		if (max_level > unix_sk(other)->recursion_level)
1698			unix_sk(other)->recursion_level = max_level;
1699		unix_state_unlock(other);
1700		other->sk_data_ready(other, size);
1701		sent += size;
1702	}
1703
1704	scm_destroy(siocb->scm);
1705	siocb->scm = NULL;
1706
1707	return sent;
1708
1709pipe_err_free:
1710	unix_state_unlock(other);
1711	kfree_skb(skb);
1712pipe_err:
1713	if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
1714		send_sig(SIGPIPE, current, 0);
1715	err = -EPIPE;
1716out_err:
1717	scm_destroy(siocb->scm);
1718	siocb->scm = NULL;
1719	return sent ? : err;
1720}
1721
1722static int unix_seqpacket_sendmsg(struct kiocb *kiocb, struct socket *sock,
1723				  struct msghdr *msg, size_t len)
1724{
1725	int err;
1726	struct sock *sk = sock->sk;
1727
1728	err = sock_error(sk);
1729	if (err)
1730		return err;
1731
1732	if (sk->sk_state != TCP_ESTABLISHED)
1733		return -ENOTCONN;
1734
1735	if (msg->msg_namelen)
1736		msg->msg_namelen = 0;
1737
1738	return unix_dgram_sendmsg(kiocb, sock, msg, len);
1739}
1740
1741static int unix_seqpacket_recvmsg(struct kiocb *iocb, struct socket *sock,
1742			      struct msghdr *msg, size_t size,
1743			      int flags)
1744{
1745	struct sock *sk = sock->sk;
1746
1747	if (sk->sk_state != TCP_ESTABLISHED)
1748		return -ENOTCONN;
1749
1750	return unix_dgram_recvmsg(iocb, sock, msg, size, flags);
1751}
1752
1753static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
1754{
1755	struct unix_sock *u = unix_sk(sk);
1756
1757	if (u->addr) {
1758		msg->msg_namelen = u->addr->len;
1759		memcpy(msg->msg_name, u->addr->name, u->addr->len);
1760	}
1761}
1762
1763static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
1764			      struct msghdr *msg, size_t size,
1765			      int flags)
1766{
1767	struct sock_iocb *siocb = kiocb_to_siocb(iocb);
1768	struct scm_cookie tmp_scm;
1769	struct sock *sk = sock->sk;
1770	struct unix_sock *u = unix_sk(sk);
1771	int noblock = flags & MSG_DONTWAIT;
1772	struct sk_buff *skb;
1773	int err;
1774	int peeked, skip;
1775
1776	err = -EOPNOTSUPP;
1777	if (flags&MSG_OOB)
1778		goto out;
1779
1780	err = mutex_lock_interruptible(&u->readlock);
1781	if (err) {
1782		err = sock_intr_errno(sock_rcvtimeo(sk, noblock));
1783		goto out;
1784	}
1785
1786	skip = sk_peek_offset(sk, flags);
1787
1788	skb = __skb_recv_datagram(sk, flags, &peeked, &skip, &err);
1789	if (!skb) {
1790		unix_state_lock(sk);
1791		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
1792		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
1793		    (sk->sk_shutdown & RCV_SHUTDOWN))
1794			err = 0;
1795		unix_state_unlock(sk);
1796		goto out_unlock;
1797	}
1798
1799	wake_up_interruptible_sync_poll(&u->peer_wait,
1800					POLLOUT | POLLWRNORM | POLLWRBAND);
1801
1802	if (msg->msg_name)
1803		unix_copy_addr(msg, skb->sk);
1804
1805	if (size > skb->len - skip)
1806		size = skb->len - skip;
1807	else if (size < skb->len - skip)
1808		msg->msg_flags |= MSG_TRUNC;
1809
1810	err = skb_copy_datagram_iovec(skb, skip, msg->msg_iov, size);
1811	if (err)
1812		goto out_free;
1813
1814	if (sock_flag(sk, SOCK_RCVTSTAMP))
1815		__sock_recv_timestamp(msg, sk, skb);
1816
1817	if (!siocb->scm) {
1818		siocb->scm = &tmp_scm;
1819		memset(&tmp_scm, 0, sizeof(tmp_scm));
1820	}
1821	scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
1822	unix_set_secdata(siocb->scm, skb);
1823
1824	if (!(flags & MSG_PEEK)) {
1825		if (UNIXCB(skb).fp)
1826			unix_detach_fds(siocb->scm, skb);
1827
1828		sk_peek_offset_bwd(sk, skb->len);
1829	} else {
1830		/* It is questionable: on PEEK we could:
1831		   - not return fds - good, but too simple 8)
1832		   - return fds, and not return them on read (old strategy,
1833		     apparently wrong)
1834		   - clone fds (I chose it for now, it is the most universal
1835		     solution)
1836
1837		   POSIX 1003.1g does not actually define this clearly
1838		   at all. POSIX 1003.1g doesn't define a lot of things
1839		   clearly however!
1840
1841		*/
1842
1843		sk_peek_offset_fwd(sk, size);
1844
1845		if (UNIXCB(skb).fp)
1846			siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1847	}
1848	err = (flags & MSG_TRUNC) ? skb->len - skip : size;
1849
1850	scm_recv(sock, msg, siocb->scm, flags);
1851
1852out_free:
1853	skb_free_datagram(sk, skb);
1854out_unlock:
1855	mutex_unlock(&u->readlock);
1856out:
1857	return err;
1858}
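
/*
 * The peek offset consumed above is controlled from userspace with
 * SO_PEEK_OFF (sketch): once set to a non-negative value, every
 * MSG_PEEK advances the offset, so successive peeks walk forward
 * through the queued data instead of rereading from the start.
 *
 *	int off = 0;
 *
 *	setsockopt(fd, SOL_SOCKET, SO_PEEK_OFF, &off, sizeof(off));
 *	recv(fd, buf, sizeof(buf), MSG_PEEK);
 *	recv(fd, buf, sizeof(buf), MSG_PEEK);
 */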
1859
1860/*
1861 *	Sleep until more data has arrived. But check for races..
1862 */
1863static long unix_stream_data_wait(struct sock *sk, long timeo,
1864				  struct sk_buff *last)
1865{
1866	DEFINE_WAIT(wait);
1867
1868	unix_state_lock(sk);
1869
1870	for (;;) {
1871		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1872
1873		if (skb_peek_tail(&sk->sk_receive_queue) != last ||
1874		    sk->sk_err ||
1875		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
1876		    signal_pending(current) ||
1877		    !timeo)
1878			break;
1879
1880		set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1881		unix_state_unlock(sk);
1882		timeo = freezable_schedule_timeout(timeo);
1883		unix_state_lock(sk);
1884		clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1885	}
1886
1887	finish_wait(sk_sleep(sk), &wait);
1888	unix_state_unlock(sk);
1889	return timeo;
1890}
1891
1892static unsigned int unix_skb_len(const struct sk_buff *skb)
1893{
1894	return skb->len - UNIXCB(skb).consumed;
1895}
1896
1897static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
1898			       struct msghdr *msg, size_t size,
1899			       int flags)
1900{
1901	struct sock_iocb *siocb = kiocb_to_siocb(iocb);
1902	struct scm_cookie tmp_scm;
1903	struct sock *sk = sock->sk;
1904	struct unix_sock *u = unix_sk(sk);
1905	struct sockaddr_un *sunaddr = msg->msg_name;
1906	int copied = 0;
1907	int check_creds = 0;
1908	int target;
1909	int err = 0;
1910	long timeo;
1911	int skip;
1912
1913	err = -EINVAL;
1914	if (sk->sk_state != TCP_ESTABLISHED)
1915		goto out;
1916
1917	err = -EOPNOTSUPP;
1918	if (flags&MSG_OOB)
1919		goto out;
1920
1921	target = sock_rcvlowat(sk, flags&MSG_WAITALL, size);
1922	timeo = sock_rcvtimeo(sk, flags&MSG_DONTWAIT);
1923
1924	/* Lock the socket to prevent queue disordering
1925	 * while we sleep in memcpy_tomsg
1926	 */
1927
1928	if (!siocb->scm) {
1929		siocb->scm = &tmp_scm;
1930		memset(&tmp_scm, 0, sizeof(tmp_scm));
1931	}
1932
1933	err = mutex_lock_interruptible(&u->readlock);
1934	if (err) {
1935		err = sock_intr_errno(timeo);
1936		goto out;
1937	}
1938
1939	do {
1940		int chunk;
1941		struct sk_buff *skb, *last;
1942
1943		unix_state_lock(sk);
1944		last = skb = skb_peek(&sk->sk_receive_queue);
1945again:
1946		if (skb == NULL) {
1947			unix_sk(sk)->recursion_level = 0;
1948			if (copied >= target)
1949				goto unlock;
1950
1951			/*
1952			 *	POSIX 1003.1g mandates this order.
1953			 */
1954
1955			err = sock_error(sk);
1956			if (err)
1957				goto unlock;
1958			if (sk->sk_shutdown & RCV_SHUTDOWN)
1959				goto unlock;
1960
1961			unix_state_unlock(sk);
1962			err = -EAGAIN;
1963			if (!timeo)
1964				break;
1965			mutex_unlock(&u->readlock);
1966
1967			timeo = unix_stream_data_wait(sk, timeo, last);
1968
1969			if (signal_pending(current)
1970			    ||  mutex_lock_interruptible(&u->readlock)) {
1971				err = sock_intr_errno(timeo);
1972				goto out;
1973			}
1974
1975			continue;
1976 unlock:
1977			unix_state_unlock(sk);
1978			break;
1979		}
1980
1981		skip = sk_peek_offset(sk, flags);
1982		while (skip >= unix_skb_len(skb)) {
1983			skip -= unix_skb_len(skb);
1984			last = skb;
1985			skb = skb_peek_next(skb, &sk->sk_receive_queue);
1986			if (!skb)
1987				goto again;
1988		}
1989
1990		unix_state_unlock(sk);
1991
1992		if (check_creds) {
1993			/* Never glue messages from different writers */
1994			if ((UNIXCB(skb).pid  != siocb->scm->pid) ||
1995			    !uid_eq(UNIXCB(skb).uid, siocb->scm->creds.uid) ||
1996			    !gid_eq(UNIXCB(skb).gid, siocb->scm->creds.gid))
1997				break;
1998		} else if (test_bit(SOCK_PASSCRED, &sock->flags)) {
1999			/* Copy credentials */
2000			scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2001			check_creds = 1;
2002		}
2003
2004		/* Copy address just once */
2005		if (sunaddr) {
2006			unix_copy_addr(msg, skb->sk);
2007			sunaddr = NULL;
2008		}
2009
2010		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2011		if (skb_copy_datagram_iovec(skb, UNIXCB(skb).consumed + skip,
2012					    msg->msg_iov, chunk)) {
2013			if (copied == 0)
2014				copied = -EFAULT;
2015			break;
2016		}
2017		copied += chunk;
2018		size -= chunk;
2019
2020		/* Mark read part of skb as used */
2021		if (!(flags & MSG_PEEK)) {
2022			UNIXCB(skb).consumed += chunk;
2023
2024			sk_peek_offset_bwd(sk, chunk);
2025
2026			if (UNIXCB(skb).fp)
2027				unix_detach_fds(siocb->scm, skb);
2028
2029			if (unix_skb_len(skb))
2030				break;
2031
2032			skb_unlink(skb, &sk->sk_receive_queue);
2033			consume_skb(skb);
2034
2035			if (siocb->scm->fp)
2036				break;
2037		} else {
2038			/* It is questionable, see note in unix_dgram_recvmsg.
2039			 */
2040			if (UNIXCB(skb).fp)
2041				siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);
2042
2043			sk_peek_offset_fwd(sk, chunk);
2044
2045			break;
2046		}
2047	} while (size);
2048
2049	mutex_unlock(&u->readlock);
2050	scm_recv(sock, msg, siocb->scm, flags);
2051out:
2052	return copied ? : err;
2053}
2054
2055static int unix_shutdown(struct socket *sock, int mode)
2056{
2057	struct sock *sk = sock->sk;
2058	struct sock *other;
2059
2060	if (mode < SHUT_RD || mode > SHUT_RDWR)
2061		return -EINVAL;
2062	/* This maps:
2063	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
2064	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
2065	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
2066	 */
2067	++mode;
2068
2069	unix_state_lock(sk);
2070	sk->sk_shutdown |= mode;
2071	other = unix_peer(sk);
2072	if (other)
2073		sock_hold(other);
2074	unix_state_unlock(sk);
2075	sk->sk_state_change(sk);
2076
2077	if (other &&
2078		(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
2079
2080		int peer_mode = 0;
2081
2082		if (mode & RCV_SHUTDOWN)
2083			peer_mode |= SEND_SHUTDOWN;
2084		if (mode & SEND_SHUTDOWN)
2085			peer_mode |= RCV_SHUTDOWN;
2086		unix_state_lock(other);
2087		other->sk_shutdown |= peer_mode;
2088		unix_state_unlock(other);
2089		other->sk_state_change(other);
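		/* Async notification: a full shutdown reads as a hangup,
		 * while closing only our write side merely makes the peer
		 * readable (it will see EOF).
		 */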
2090		if (peer_mode == SHUTDOWN_MASK)
2091			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
2092		else if (peer_mode & RCV_SHUTDOWN)
2093			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
2094	}
2095	if (other)
2096		sock_put(other);
2097
2098	return 0;
2099}
2100
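/*
 * Bytes available for reading: the sum of all unread payload for stream
 * and seqpacket sockets, or the size of the next message for datagram
 * sockets.  Undefined for listening sockets (AF_UNIX reuses the TCP_*
 * state names).
 */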
2101long unix_inq_len(struct sock *sk)
2102{
2103	struct sk_buff *skb;
2104	long amount = 0;
2105
2106	if (sk->sk_state == TCP_LISTEN)
2107		return -EINVAL;
2108
2109	spin_lock(&sk->sk_receive_queue.lock);
2110	if (sk->sk_type == SOCK_STREAM ||
2111	    sk->sk_type == SOCK_SEQPACKET) {
2112		skb_queue_walk(&sk->sk_receive_queue, skb)
2113			amount += unix_skb_len(skb);
2114	} else {
2115		skb = skb_peek(&sk->sk_receive_queue);
2116		if (skb)
2117			amount = skb->len;
2118	}
2119	spin_unlock(&sk->sk_receive_queue.lock);
2120
2121	return amount;
2122}
2123EXPORT_SYMBOL_GPL(unix_inq_len);
2124
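/* Outstanding send allocation: data queued to the peer but not yet read. */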
2125long unix_outq_len(struct sock *sk)
2126{
2127	return sk_wmem_alloc_get(sk);
2128}
2129EXPORT_SYMBOL_GPL(unix_outq_len);
2130
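/*
 * SIOCOUTQ and SIOCINQ expose the two counters above to userspace.
 * A minimal sketch of the calling side (illustrative only; "fd" is
 * assumed to be an AF_UNIX socket, error handling is omitted, and
 * <sys/ioctl.h> plus <linux/sockios.h> are needed):
 *
 *	int pending;
 *
 *	if (ioctl(fd, SIOCINQ, &pending) == 0)
 *		printf("%d bytes readable\n", pending);
 */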
2131static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2132{
2133	struct sock *sk = sock->sk;
2134	long amount = 0;
2135	int err;
2136
2137	switch (cmd) {
2138	case SIOCOUTQ:
2139		amount = unix_outq_len(sk);
2140		err = put_user(amount, (int __user *)arg);
2141		break;
2142	case SIOCINQ:
2143		amount = unix_inq_len(sk);
2144		if (amount < 0)
2145			err = amount;
2146		else
2147			err = put_user(amount, (int __user *)arg);
2148		break;
2149	default:
2150		err = -ENOIOCTLCMD;
2151		break;
2152	}
2153	return err;
2154}
2155
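/*
 * Poll for connection-oriented sockets: readable when data is queued or
 * the receive side has been shut down, writable while send buffer space
 * remains, POLLHUP on full shutdown or a dead connection.
 */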
2156static unsigned int unix_poll(struct file *file, struct socket *sock, poll_table *wait)
2157{
2158	struct sock *sk = sock->sk;
2159	unsigned int mask;
2160
2161	sock_poll_wait(file, sk_sleep(sk), wait);
2162	mask = 0;
2163
2164	/* exceptional events? */
2165	if (sk->sk_err)
2166		mask |= POLLERR;
2167	if (sk->sk_shutdown == SHUTDOWN_MASK)
2168		mask |= POLLHUP;
2169	if (sk->sk_shutdown & RCV_SHUTDOWN)
2170		mask |= POLLRDHUP | POLLIN | POLLRDNORM;
2171
2172	/* readable? */
2173	if (!skb_queue_empty(&sk->sk_receive_queue))
2174		mask |= POLLIN | POLLRDNORM;
2175
2176	/* Connection-based sockets need to check for termination and startup */
2177	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
2178	    sk->sk_state == TCP_CLOSE)
2179		mask |= POLLHUP;
2180
2181	/*
2182	 * We also report writable when the other side has shut down the
2183	 * connection, so that a writer is not left stuck on a dead socket.
2184	 */
2185	if (unix_writable(sk))
2186		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
2187
2188	return mask;
2189}
2190
2191static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
2192				    poll_table *wait)
2193{
2194	struct sock *sk = sock->sk, *other;
2195	unsigned int mask, writable;
2196
2197	sock_poll_wait(file, sk_sleep(sk), wait);
2198	mask = 0;
2199
2200	/* exceptional events? */
2201	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
2202		mask |= POLLERR |
2203			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? POLLPRI : 0);
2204
2205	if (sk->sk_shutdown & RCV_SHUTDOWN)
2206		mask |= POLLRDHUP | POLLIN | POLLRDNORM;
2207	if (sk->sk_shutdown == SHUTDOWN_MASK)
2208		mask |= POLLHUP;
2209
2210	/* readable? */
2211	if (!skb_queue_empty(&sk->sk_receive_queue))
2212		mask |= POLLIN | POLLRDNORM;
2213
2214	/* Connection-based sockets need to check for termination and startup */
2215	if (sk->sk_type == SOCK_SEQPACKET) {
2216		if (sk->sk_state == TCP_CLOSE)
2217			mask |= POLLHUP;
2218		/* connection hasn't started yet? */
2219		if (sk->sk_state == TCP_SYN_SENT)
2220			return mask;
2221	}
2222
2223	/* No write status requested, avoid expensive OUT tests. */
2224	if (!(poll_requested_events(wait) & (POLLWRBAND|POLLWRNORM|POLLOUT)))
2225		return mask;
2226
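	/* A connected sender is also throttled by the peer's receive
	 * queue: if the peer is not connected back to us and its queue
	 * is full, report the socket as not writable and poll on the
	 * peer's wait queue as well.
	 */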
2227	writable = unix_writable(sk);
2228	other = unix_peer_get(sk);
2229	if (other) {
2230		if (unix_peer(other) != sk) {
2231			sock_poll_wait(file, &unix_sk(other)->peer_wait, wait);
2232			if (unix_recvq_full(other))
2233				writable = 0;
2234		}
2235		sock_put(other);
2236	}
2237
2238	if (writable)
2239		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
2240	else
2241		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
2242
2243	return mask;
2244}
2245
2246#ifdef CONFIG_PROC_FS
2247
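/*
 * The seq_file position encodes a (bucket, offset) pair: the unix hash
 * bucket lives in the high bits of *pos and a 1-based offset within
 * that bucket in the low BUCKET_SPACE bits.
 */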
2248#define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
2249
2250#define get_bucket(x) ((x) >> BUCKET_SPACE)
2251#define get_offset(x) ((x) & ((1L << BUCKET_SPACE) - 1))
2252#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
2253
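/* Return the socket at the offset encoded in *pos within its bucket,
 * skipping sockets that belong to other network namespaces.
 */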
2254static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
2255{
2256	unsigned long offset = get_offset(*pos);
2257	unsigned long bucket = get_bucket(*pos);
2258	struct sock *sk;
2259	unsigned long count = 0;
2260
2261	for (sk = sk_head(&unix_socket_table[bucket]); sk; sk = sk_next(sk)) {
2262		if (sock_net(sk) != seq_file_net(seq))
2263			continue;
2264		if (++count == offset)
2265			break;
2266	}
2267
2268	return sk;
2269}
2270
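/* Advance to the next socket of this namespace, moving on to the
 * following buckets once the current one is exhausted.
 */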
2271static struct sock *unix_next_socket(struct seq_file *seq,
2272				     struct sock *sk,
2273				     loff_t *pos)
2274{
2275	unsigned long bucket;
2276
2277	while (sk > (struct sock *)SEQ_START_TOKEN) {
2278		sk = sk_next(sk);
2279		if (!sk)
2280			goto next_bucket;
2281		if (sock_net(sk) == seq_file_net(seq))
2282			return sk;
2283	}
2284
2285	do {
2286		sk = unix_from_bucket(seq, pos);
2287		if (sk)
2288			return sk;
2289
2290next_bucket:
2291		bucket = get_bucket(*pos) + 1;
2292		*pos = set_bucket_offset(bucket, 1);
2293	} while (bucket < ARRAY_SIZE(unix_socket_table));
2294
2295	return NULL;
2296}
2297
2298static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
2299	__acquires(unix_table_lock)
2300{
2301	spin_lock(&unix_table_lock);
2302
2303	if (!*pos)
2304		return SEQ_START_TOKEN;
2305
2306	if (get_bucket(*pos) >= ARRAY_SIZE(unix_socket_table))
2307		return NULL;
2308
2309	return unix_next_socket(seq, NULL, pos);
2310}
2311
2312static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2313{
2314	++*pos;
2315	return unix_next_socket(seq, v, pos);
2316}
2317
2318static void unix_seq_stop(struct seq_file *seq, void *v)
2319	__releases(unix_table_lock)
2320{
2321	spin_unlock(&unix_table_lock);
2322}
2323
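/*
 * One /proc/net/unix line per socket: kernel address, refcount, protocol
 * (always 0), flags (__SO_ACCEPTCON for listeners), type, internal state,
 * inode number and, if bound, the path ('@' marks an abstract name).
 */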
2324static int unix_seq_show(struct seq_file *seq, void *v)
2325{
2327	if (v == SEQ_START_TOKEN) {
2328		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
2329			 "Inode Path\n");
2330	} else {
2331		struct sock *s = v;
2332		struct unix_sock *u = unix_sk(s);

2333		unix_state_lock(s);
2334
2335		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
2336			s,
2337			atomic_read(&s->sk_refcnt),
2338			0,
2339			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
2340			s->sk_type,
2341			s->sk_socket ?
2342			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
2343			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
2344			sock_i_ino(s));
2345
2346		if (u->addr) {
2347			int i, len;

2348			seq_putc(seq, ' ');
2349
2350			i = 0;
2351			len = u->addr->len - sizeof(short);
2352			if (!UNIX_ABSTRACT(s))
2353				len--;
2354			else {
2355				seq_putc(seq, '@');
2356				i++;
2357			}
2358			for ( ; i < len; i++)
2359				seq_putc(seq, u->addr->name->sun_path[i]);
2360		}
2361		unix_state_unlock(s);
2362		seq_putc(seq, '\n');
2363	}
2364
2365	return 0;
2366}
2367
2368static const struct seq_operations unix_seq_ops = {
2369	.start  = unix_seq_start,
2370	.next   = unix_seq_next,
2371	.stop   = unix_seq_stop,
2372	.show   = unix_seq_show,
2373};
2374
2375static int unix_seq_open(struct inode *inode, struct file *file)
2376{
2377	return seq_open_net(inode, file, &unix_seq_ops,
2378			    sizeof(struct seq_net_private));
2379}
2380
2381static const struct file_operations unix_seq_fops = {
2382	.owner		= THIS_MODULE,
2383	.open		= unix_seq_open,
2384	.read		= seq_read,
2385	.llseek		= seq_lseek,
2386	.release	= seq_release_net,
2387};
2388
2389#endif
2390
2391static const struct net_proto_family unix_family_ops = {
2392	.family = PF_UNIX,
2393	.create = unix_create,
2394	.owner	= THIS_MODULE,
2395};
2396
2398static int __net_init unix_net_init(struct net *net)
2399{
2400	int error = -ENOMEM;
2401
2402	net->unx.sysctl_max_dgram_qlen = 10;
2403	if (unix_sysctl_register(net))
2404		goto out;
2405
2406#ifdef CONFIG_PROC_FS
2407	if (!proc_create("unix", 0, net->proc_net, &unix_seq_fops)) {
2408		unix_sysctl_unregister(net);
2409		goto out;
2410	}
2411#endif
2412	error = 0;
2413out:
2414	return error;
2415}
2416
2417static void __net_exit unix_net_exit(struct net *net)
2418{
2419	unix_sysctl_unregister(net);
2420	remove_proc_entry("unix", net->proc_net);
2421}
2422
2423static struct pernet_operations unix_net_ops = {
2424	.init = unix_net_init,
2425	.exit = unix_net_exit,
2426};
2427
2428static int __init af_unix_init(void)
2429{
2430	int rc;
2431
2432	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > FIELD_SIZEOF(struct sk_buff, cb));
2433
2434	rc = proto_register(&unix_proto, 1);
2435	if (rc != 0) {
2436		printk(KERN_CRIT "%s: Cannot create unix_sock SLAB cache!\n",
2437		       __func__);
2438		goto out;
2439	}
2440
2441	sock_register(&unix_family_ops);
2442	register_pernet_subsys(&unix_net_ops);
2443out:
2444	return rc;
2445}
2446
2447static void __exit af_unix_exit(void)
2448{
2449	sock_unregister(PF_UNIX);
2450	proto_unregister(&unix_proto);
2451	unregister_pernet_subsys(&unix_net_ops);
2452}
2453
2454/* Earlier than device_initcall() so that other drivers invoking
2455 * request_module() don't end up in a loop when modprobe tries
2456 * to use a UNIX socket.  But later than subsys_initcall() because
2457 * we depend on stuff initialised there.
 */
2458fs_initcall(af_unix_init);
2459module_exit(af_unix_exit);
2460
2461MODULE_LICENSE("GPL");
2462MODULE_ALIAS_NETPROTO(PF_UNIX);
2463