1/*
2  FUSE: Filesystem in Userspace
3  Copyright (C) 2001-2008  Miklos Szeredi <miklos@szeredi.hu>
4
5  This program can be distributed under the terms of the GNU GPL.
6  See the file COPYING.
7*/
8
9#include "fuse_i.h"
10
11#include <linux/init.h>
12#include <linux/module.h>
13#include <linux/poll.h>
14#include <linux/uio.h>
15#include <linux/miscdevice.h>
16#include <linux/pagemap.h>
17#include <linux/file.h>
18#include <linux/slab.h>
19#include <linux/pipe_fs_i.h>
20#include <linux/swap.h>
21#include <linux/splice.h>
22#include <linux/aio.h>
23#include <linux/freezer.h>
24
25MODULE_ALIAS_MISCDEV(FUSE_MINOR);
26MODULE_ALIAS("devname:fuse");
27
28static struct kmem_cache *fuse_req_cachep;
29
30static struct fuse_conn *fuse_get_conn(struct file *file)
31{
	/*
	 * Lockless access is OK, because file->private_data is set
	 * once during mount and is valid until the file is released.
	 */
36	return file->private_data;
37}
38
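/* Reset a request to a pristine state and attach its page vector */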
39static void fuse_request_init(struct fuse_req *req, struct page **pages,
40			      struct fuse_page_desc *page_descs,
41			      unsigned npages)
42{
43	memset(req, 0, sizeof(*req));
44	memset(pages, 0, sizeof(*pages) * npages);
45	memset(page_descs, 0, sizeof(*page_descs) * npages);
46	INIT_LIST_HEAD(&req->list);
47	INIT_LIST_HEAD(&req->intr_entry);
48	init_waitqueue_head(&req->waitq);
49	atomic_set(&req->count, 1);
50	req->pages = pages;
51	req->page_descs = page_descs;
52	req->max_pages = npages;
53}
54
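/*
 * Allocate a request able to carry @npages pages.  Small requests use
 * the page vector embedded in struct fuse_req; larger ones get a
 * separately allocated vector.
 */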
55static struct fuse_req *__fuse_request_alloc(unsigned npages, gfp_t flags)
56{
57	struct fuse_req *req = kmem_cache_alloc(fuse_req_cachep, flags);
58	if (req) {
59		struct page **pages;
60		struct fuse_page_desc *page_descs;
61
62		if (npages <= FUSE_REQ_INLINE_PAGES) {
63			pages = req->inline_pages;
64			page_descs = req->inline_page_descs;
65		} else {
66			pages = kmalloc(sizeof(struct page *) * npages, flags);
67			page_descs = kmalloc(sizeof(struct fuse_page_desc) *
68					     npages, flags);
69		}
70
71		if (!pages || !page_descs) {
72			kfree(pages);
73			kfree(page_descs);
74			kmem_cache_free(fuse_req_cachep, req);
75			return NULL;
76		}
77
78		fuse_request_init(req, pages, page_descs, npages);
79	}
80	return req;
81}
82
83struct fuse_req *fuse_request_alloc(unsigned npages)
84{
85	return __fuse_request_alloc(npages, GFP_KERNEL);
86}
87EXPORT_SYMBOL_GPL(fuse_request_alloc);
88
89struct fuse_req *fuse_request_alloc_nofs(unsigned npages)
90{
91	return __fuse_request_alloc(npages, GFP_NOFS);
92}
93
94void fuse_request_free(struct fuse_req *req)
95{
96	if (req->pages != req->inline_pages) {
97		kfree(req->pages);
98		kfree(req->page_descs);
99	}
100	kmem_cache_free(fuse_req_cachep, req);
101}
102
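/* Block all signals except SIGKILL, saving the old mask in @oldset */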
103static void block_sigs(sigset_t *oldset)
104{
105	sigset_t mask;
106
107	siginitsetinv(&mask, sigmask(SIGKILL));
108	sigprocmask(SIG_BLOCK, &mask, oldset);
109}
110
111static void restore_sigs(sigset_t *oldset)
112{
113	sigprocmask(SIG_SETMASK, oldset, NULL);
114}
115
116void __fuse_get_request(struct fuse_req *req)
117{
118	atomic_inc(&req->count);
119}
120
121/* Must be called with > 1 refcount */
122static void __fuse_put_request(struct fuse_req *req)
123{
124	BUG_ON(atomic_read(&req->count) < 2);
125	atomic_dec(&req->count);
126}
127
128static void fuse_req_init_context(struct fuse_req *req)
129{
130	req->in.h.uid = from_kuid_munged(&init_user_ns, current_fsuid());
131	req->in.h.gid = from_kgid_munged(&init_user_ns, current_fsgid());
132	req->in.h.pid = current->pid;
133}
134
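/*
 * Request allocation must block until the connection has been
 * initialized; background requests must additionally wait while
 * fc->blocked is set.
 */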
135static bool fuse_block_alloc(struct fuse_conn *fc, bool for_background)
136{
137	return !fc->initialized || (for_background && fc->blocked);
138}
139
140static struct fuse_req *__fuse_get_req(struct fuse_conn *fc, unsigned npages,
141				       bool for_background)
142{
143	struct fuse_req *req;
144	int err;
145	atomic_inc(&fc->num_waiting);
146
147	if (fuse_block_alloc(fc, for_background)) {
148		sigset_t oldset;
149		int intr;
150
151		block_sigs(&oldset);
152		intr = wait_event_interruptible_exclusive(fc->blocked_waitq,
153				!fuse_block_alloc(fc, for_background));
154		restore_sigs(&oldset);
155		err = -EINTR;
156		if (intr)
157			goto out;
158	}
159
160	err = -ENOTCONN;
161	if (!fc->connected)
162		goto out;
163
164	req = fuse_request_alloc(npages);
165	err = -ENOMEM;
166	if (!req) {
167		if (for_background)
168			wake_up(&fc->blocked_waitq);
169		goto out;
170	}
171
172	fuse_req_init_context(req);
173	req->waiting = 1;
174	req->background = for_background;
175	return req;
176
177 out:
178	atomic_dec(&fc->num_waiting);
179	return ERR_PTR(err);
180}
181
182struct fuse_req *fuse_get_req(struct fuse_conn *fc, unsigned npages)
183{
184	return __fuse_get_req(fc, npages, false);
185}
186EXPORT_SYMBOL_GPL(fuse_get_req);
187
188struct fuse_req *fuse_get_req_for_background(struct fuse_conn *fc,
189					     unsigned npages)
190{
191	return __fuse_get_req(fc, npages, true);
192}
193EXPORT_SYMBOL_GPL(fuse_get_req_for_background);
194
/*
 * Return the request in fuse_file->reserved_req.  However, that may
 * currently be in use; if so, wait for it to become available.
 */
200static struct fuse_req *get_reserved_req(struct fuse_conn *fc,
201					 struct file *file)
202{
203	struct fuse_req *req = NULL;
204	struct fuse_file *ff = file->private_data;
205
206	do {
207		wait_event(fc->reserved_req_waitq, ff->reserved_req);
208		spin_lock(&fc->lock);
209		if (ff->reserved_req) {
210			req = ff->reserved_req;
211			ff->reserved_req = NULL;
212			req->stolen_file = get_file(file);
213		}
214		spin_unlock(&fc->lock);
215	} while (!req);
216
217	return req;
218}
219
220/*
221 * Put stolen request back into fuse_file->reserved_req
222 */
223static void put_reserved_req(struct fuse_conn *fc, struct fuse_req *req)
224{
225	struct file *file = req->stolen_file;
226	struct fuse_file *ff = file->private_data;
227
228	spin_lock(&fc->lock);
229	fuse_request_init(req, req->pages, req->page_descs, req->max_pages);
230	BUG_ON(ff->reserved_req);
231	ff->reserved_req = req;
232	wake_up_all(&fc->reserved_req_waitq);
233	spin_unlock(&fc->lock);
234	fput(file);
235}
236
/*
 * Get a request for a file operation; always succeeds
 *
 * This is used for sending the FLUSH request, which must get to
 * userspace, due to POSIX locks which may need to be unlocked.
 *
 * If allocation fails due to OOM, use the reserved request in
 * fuse_file.
 *
 * This is very unlikely to deadlock accidentally, since the
 * filesystem should not have its own file open.  If deadlock is
 * intentional, it can still be broken by "aborting" the filesystem.
 */
250struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc,
251					     struct file *file)
252{
253	struct fuse_req *req;
254
255	atomic_inc(&fc->num_waiting);
256	wait_event(fc->blocked_waitq, fc->initialized);
257	req = fuse_request_alloc(0);
258	if (!req)
259		req = get_reserved_req(fc, file);
260
261	fuse_req_init_context(req);
262	req->waiting = 1;
263	req->background = 0;
264	return req;
265}
266
267void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req)
268{
269	if (atomic_dec_and_test(&req->count)) {
270		if (unlikely(req->background)) {
271			/*
272			 * We get here in the unlikely case that a background
273			 * request was allocated but not sent
274			 */
275			spin_lock(&fc->lock);
276			if (!fc->blocked)
277				wake_up(&fc->blocked_waitq);
278			spin_unlock(&fc->lock);
279		}
280
281		if (req->waiting)
282			atomic_dec(&fc->num_waiting);
283
284		if (req->stolen_file)
285			put_reserved_req(fc, req);
286		else
287			fuse_request_free(req);
288	}
289}
290EXPORT_SYMBOL_GPL(fuse_put_request);
291
292static unsigned len_args(unsigned numargs, struct fuse_arg *args)
293{
294	unsigned nbytes = 0;
295	unsigned i;
296
297	for (i = 0; i < numargs; i++)
298		nbytes += args[i].size;
299
300	return nbytes;
301}
302
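/* Allocate the next unique request ID; called with fc->lock held */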
303static u64 fuse_get_unique(struct fuse_conn *fc)
304{
305	fc->reqctr++;
306	/* zero is special */
307	if (fc->reqctr == 0)
308		fc->reqctr = 1;
309
310	return fc->reqctr;
311}
312
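/* Add a request to the pending list and wake up the device readers */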
313static void queue_request(struct fuse_conn *fc, struct fuse_req *req)
314{
315	req->in.h.len = sizeof(struct fuse_in_header) +
316		len_args(req->in.numargs, (struct fuse_arg *) req->in.args);
317	list_add_tail(&req->list, &fc->pending);
318	req->state = FUSE_REQ_PENDING;
319	if (!req->waiting) {
320		req->waiting = 1;
321		atomic_inc(&fc->num_waiting);
322	}
323	wake_up(&fc->waitq);
324	kill_fasync(&fc->fasync, SIGIO, POLL_IN);
325}
326
327void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget,
328		       u64 nodeid, u64 nlookup)
329{
330	forget->forget_one.nodeid = nodeid;
331	forget->forget_one.nlookup = nlookup;
332
333	spin_lock(&fc->lock);
334	if (fc->connected) {
335		fc->forget_list_tail->next = forget;
336		fc->forget_list_tail = forget;
337		wake_up(&fc->waitq);
338		kill_fasync(&fc->fasync, SIGIO, POLL_IN);
339	} else {
340		kfree(forget);
341	}
342	spin_unlock(&fc->lock);
343}
344
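/*
 * Move queued background requests to the pending list, keeping at most
 * max_background of them active at the same time.
 */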
345static void flush_bg_queue(struct fuse_conn *fc)
346{
347	while (fc->active_background < fc->max_background &&
348	       !list_empty(&fc->bg_queue)) {
349		struct fuse_req *req;
350
351		req = list_entry(fc->bg_queue.next, struct fuse_req, list);
352		list_del(&req->list);
353		fc->active_background++;
354		req->in.h.unique = fuse_get_unique(fc);
355		queue_request(fc, req);
356	}
357}
358
/*
 * This function is called when a request is finished.  Either a reply
 * has arrived, or it was aborted (and not yet sent), or some error
 * occurred during communication with userspace, or the device file
 * was closed.  The requester thread is woken up (if still waiting),
 * the 'end' callback is called if given, else the reference to the
 * request is released.
 *
 * Called with fc->lock held, unlocks it
 */
369static void request_end(struct fuse_conn *fc, struct fuse_req *req)
370__releases(fc->lock)
371{
372	void (*end) (struct fuse_conn *, struct fuse_req *) = req->end;
373	req->end = NULL;
374	list_del(&req->list);
375	list_del(&req->intr_entry);
376	req->state = FUSE_REQ_FINISHED;
377	if (req->background) {
378		req->background = 0;
379
380		if (fc->num_background == fc->max_background)
381			fc->blocked = 0;
382
383		/* Wake up next waiter, if any */
384		if (!fc->blocked && waitqueue_active(&fc->blocked_waitq))
385			wake_up(&fc->blocked_waitq);
386
387		if (fc->num_background == fc->congestion_threshold &&
388		    fc->connected && fc->bdi_initialized) {
389			clear_bdi_congested(&fc->bdi, BLK_RW_SYNC);
390			clear_bdi_congested(&fc->bdi, BLK_RW_ASYNC);
391		}
392		fc->num_background--;
393		fc->active_background--;
394		flush_bg_queue(fc);
395	}
396	spin_unlock(&fc->lock);
397	wake_up(&req->waitq);
398	if (end)
399		end(fc, req);
400	fuse_put_request(fc, req);
401}
402
403static void wait_answer_interruptible(struct fuse_conn *fc,
404				      struct fuse_req *req)
405__releases(fc->lock)
406__acquires(fc->lock)
407{
408	if (signal_pending(current))
409		return;
410
411	spin_unlock(&fc->lock);
412	wait_event_interruptible(req->waitq, req->state == FUSE_REQ_FINISHED);
413	spin_lock(&fc->lock);
414}
415
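/* Queue an interrupt for a request already sent to userspace */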
416static void queue_interrupt(struct fuse_conn *fc, struct fuse_req *req)
417{
418	list_add_tail(&req->intr_entry, &fc->interrupts);
419	wake_up(&fc->waitq);
420	kill_fasync(&fc->fasync, SIGIO, POLL_IN);
421}
422
423static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
424__releases(fc->lock)
425__acquires(fc->lock)
426{
427	if (!fc->no_interrupt) {
428		/* Any signal may interrupt this */
429		wait_answer_interruptible(fc, req);
430
431		if (req->aborted)
432			goto aborted;
433		if (req->state == FUSE_REQ_FINISHED)
434			return;
435
436		req->interrupted = 1;
437		if (req->state == FUSE_REQ_SENT)
438			queue_interrupt(fc, req);
439	}
440
441	if (!req->force) {
442		sigset_t oldset;
443
444		/* Only fatal signals may interrupt this */
445		block_sigs(&oldset);
446		wait_answer_interruptible(fc, req);
447		restore_sigs(&oldset);
448
449		if (req->aborted)
450			goto aborted;
451		if (req->state == FUSE_REQ_FINISHED)
452			return;
453
454		/* Request is not yet in userspace, bail out */
455		if (req->state == FUSE_REQ_PENDING) {
456			list_del(&req->list);
457			__fuse_put_request(req);
458			req->out.h.error = -EINTR;
459			return;
460		}
461	}
462
	/*
	 * Either the request is already in userspace, or it was forced.
	 * Wait it out.
	 */
467	spin_unlock(&fc->lock);
468
469	while (req->state != FUSE_REQ_FINISHED)
470		wait_event_freezable(req->waitq,
471				     req->state == FUSE_REQ_FINISHED);
472	spin_lock(&fc->lock);
473
474	if (!req->aborted)
475		return;
476
477 aborted:
478	BUG_ON(req->state != FUSE_REQ_FINISHED);
479	if (req->locked) {
		/* This is an uninterruptible sleep, because data is
		   being copied to/from the buffers of req.  While in the
		   locked state, there mustn't be any filesystem
		   operation (e.g. page fault), since that could lead
		   to deadlock */
485		spin_unlock(&fc->lock);
486		wait_event(req->waitq, !req->locked);
487		spin_lock(&fc->lock);
488	}
489}
490
491static void __fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
492{
493	BUG_ON(req->background);
494	spin_lock(&fc->lock);
495	if (!fc->connected)
496		req->out.h.error = -ENOTCONN;
497	else if (fc->conn_error)
498		req->out.h.error = -ECONNREFUSED;
499	else {
500		req->in.h.unique = fuse_get_unique(fc);
501		queue_request(fc, req);
502		/* acquire extra reference, since request is still needed
503		   after request_end() */
504		__fuse_get_request(req);
505
506		request_wait_answer(fc, req);
507	}
508	spin_unlock(&fc->lock);
509}
510
511void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
512{
513	req->isreply = 1;
514	__fuse_request_send(fc, req);
515}
516EXPORT_SYMBOL_GPL(fuse_request_send);
517
518static void fuse_request_send_nowait_locked(struct fuse_conn *fc,
519					    struct fuse_req *req)
520{
521	BUG_ON(!req->background);
522	fc->num_background++;
523	if (fc->num_background == fc->max_background)
524		fc->blocked = 1;
525	if (fc->num_background == fc->congestion_threshold &&
526	    fc->bdi_initialized) {
527		set_bdi_congested(&fc->bdi, BLK_RW_SYNC);
528		set_bdi_congested(&fc->bdi, BLK_RW_ASYNC);
529	}
530	list_add_tail(&req->list, &fc->bg_queue);
531	flush_bg_queue(fc);
532}
533
534static void fuse_request_send_nowait(struct fuse_conn *fc, struct fuse_req *req)
535{
536	spin_lock(&fc->lock);
537	if (fc->connected) {
538		fuse_request_send_nowait_locked(fc, req);
539		spin_unlock(&fc->lock);
540	} else {
541		req->out.h.error = -ENOTCONN;
542		request_end(fc, req);
543	}
544}
545
546void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req)
547{
548	req->isreply = 1;
549	fuse_request_send_nowait(fc, req);
550}
551EXPORT_SYMBOL_GPL(fuse_request_send_background);
552
553static int fuse_request_send_notify_reply(struct fuse_conn *fc,
554					  struct fuse_req *req, u64 unique)
555{
556	int err = -ENODEV;
557
558	req->isreply = 0;
559	req->in.h.unique = unique;
560	spin_lock(&fc->lock);
561	if (fc->connected) {
562		queue_request(fc, req);
563		err = 0;
564	}
565	spin_unlock(&fc->lock);
566
567	return err;
568}
569
570/*
571 * Called under fc->lock
572 *
573 * fc->connected must have been checked previously
574 */
575void fuse_request_send_background_locked(struct fuse_conn *fc,
576					 struct fuse_req *req)
577{
578	req->isreply = 1;
579	fuse_request_send_nowait_locked(fc, req);
580}
581
582void fuse_force_forget(struct file *file, u64 nodeid)
583{
584	struct inode *inode = file_inode(file);
585	struct fuse_conn *fc = get_fuse_conn(inode);
586	struct fuse_req *req;
587	struct fuse_forget_in inarg;
588
589	memset(&inarg, 0, sizeof(inarg));
590	inarg.nlookup = 1;
591	req = fuse_get_req_nofail_nopages(fc, file);
592	req->in.h.opcode = FUSE_FORGET;
593	req->in.h.nodeid = nodeid;
594	req->in.numargs = 1;
595	req->in.args[0].size = sizeof(inarg);
596	req->in.args[0].value = &inarg;
597	req->isreply = 0;
598	__fuse_request_send(fc, req);
599	/* ignore errors */
600	fuse_put_request(fc, req);
601}
602
/*
 * Lock the request.  Up to the next unlock_request() there mustn't be
 * anything that could cause a page fault.  If the request was already
 * aborted, bail out.
 */
608static int lock_request(struct fuse_conn *fc, struct fuse_req *req)
609{
610	int err = 0;
611	if (req) {
612		spin_lock(&fc->lock);
613		if (req->aborted)
614			err = -ENOENT;
615		else
616			req->locked = 1;
617		spin_unlock(&fc->lock);
618	}
619	return err;
620}
621
/*
 * Unlock the request.  If it was aborted while locked, the requester
 * thread is currently waiting for it to be unlocked, so wake it up.
 */
627static void unlock_request(struct fuse_conn *fc, struct fuse_req *req)
628{
629	if (req) {
630		spin_lock(&fc->lock);
631		req->locked = 0;
632		if (req->aborted)
633			wake_up(&req->waitq);
634		spin_unlock(&fc->lock);
635	}
636}
637
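/* State kept while copying between a request and a userspace buffer or pipe */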
638struct fuse_copy_state {
639	struct fuse_conn *fc;
640	int write;
641	struct fuse_req *req;
642	const struct iovec *iov;
643	struct pipe_buffer *pipebufs;
644	struct pipe_buffer *currbuf;
645	struct pipe_inode_info *pipe;
646	unsigned long nr_segs;
647	unsigned long seglen;
648	unsigned long addr;
649	struct page *pg;
650	unsigned len;
651	unsigned offset;
652	unsigned move_pages:1;
653};
654
655static void fuse_copy_init(struct fuse_copy_state *cs, struct fuse_conn *fc,
656			   int write,
657			   const struct iovec *iov, unsigned long nr_segs)
658{
659	memset(cs, 0, sizeof(*cs));
660	cs->fc = fc;
661	cs->write = write;
662	cs->iov = iov;
663	cs->nr_segs = nr_segs;
664}
665
666/* Unmap and put previous page of userspace buffer */
667static void fuse_copy_finish(struct fuse_copy_state *cs)
668{
669	if (cs->currbuf) {
670		struct pipe_buffer *buf = cs->currbuf;
671
672		if (cs->write)
673			buf->len = PAGE_SIZE - cs->len;
674		cs->currbuf = NULL;
675	} else if (cs->pg) {
676		if (cs->write) {
677			flush_dcache_page(cs->pg);
678			set_page_dirty_lock(cs->pg);
679		}
680		put_page(cs->pg);
681	}
682	cs->pg = NULL;
683}
684
/*
 * Get another pageful of the userspace buffer, map it into kernel
 * address space and lock the request
 */
689static int fuse_copy_fill(struct fuse_copy_state *cs)
690{
691	struct page *page;
692	int err;
693
694	unlock_request(cs->fc, cs->req);
695	fuse_copy_finish(cs);
696	if (cs->pipebufs) {
697		struct pipe_buffer *buf = cs->pipebufs;
698
699		if (!cs->write) {
700			err = buf->ops->confirm(cs->pipe, buf);
701			if (err)
702				return err;
703
704			BUG_ON(!cs->nr_segs);
705			cs->currbuf = buf;
706			cs->pg = buf->page;
707			cs->offset = buf->offset;
708			cs->len = buf->len;
709			cs->pipebufs++;
710			cs->nr_segs--;
711		} else {
712			if (cs->nr_segs == cs->pipe->buffers)
713				return -EIO;
714
715			page = alloc_page(GFP_HIGHUSER);
716			if (!page)
717				return -ENOMEM;
718
719			buf->page = page;
720			buf->offset = 0;
721			buf->len = 0;
722
723			cs->currbuf = buf;
724			cs->pg = page;
725			cs->offset = 0;
726			cs->len = PAGE_SIZE;
727			cs->pipebufs++;
728			cs->nr_segs++;
729		}
730	} else {
731		if (!cs->seglen) {
732			BUG_ON(!cs->nr_segs);
733			cs->seglen = cs->iov[0].iov_len;
734			cs->addr = (unsigned long) cs->iov[0].iov_base;
735			cs->iov++;
736			cs->nr_segs--;
737		}
738		err = get_user_pages_fast(cs->addr, 1, cs->write, &page);
739		if (err < 0)
740			return err;
741		BUG_ON(err != 1);
742		cs->pg = page;
743		cs->offset = cs->addr % PAGE_SIZE;
744		cs->len = min(PAGE_SIZE - cs->offset, cs->seglen);
745		cs->seglen -= cs->len;
746		cs->addr += cs->len;
747	}
748
749	return lock_request(cs->fc, cs->req);
750}
751
/* Copy as much to/from the userspace buffer as we can */
753static int fuse_copy_do(struct fuse_copy_state *cs, void **val, unsigned *size)
754{
755	unsigned ncpy = min(*size, cs->len);
756	if (val) {
757		void *pgaddr = kmap_atomic(cs->pg);
758		void *buf = pgaddr + cs->offset;
759
760		if (cs->write)
761			memcpy(buf, *val, ncpy);
762		else
763			memcpy(*val, buf, ncpy);
764
765		kunmap_atomic(pgaddr);
766		*val += ncpy;
767	}
768	*size -= ncpy;
769	cs->len -= ncpy;
770	cs->offset += ncpy;
771	return ncpy;
772}
773
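/*
 * Check that a page is safe to insert into the page cache: it must not
 * be mapped, must not have extra references and must not carry any
 * unexpected flags.
 */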
774static int fuse_check_page(struct page *page)
775{
776	if (page_mapcount(page) ||
777	    page->mapping != NULL ||
778	    page_count(page) != 1 ||
779	    (page->flags & PAGE_FLAGS_CHECK_AT_PREP &
780	     ~(1 << PG_locked |
781	       1 << PG_referenced |
782	       1 << PG_uptodate |
783	       1 << PG_lru |
784	       1 << PG_active |
785	       1 << PG_reclaim))) {
786		printk(KERN_WARNING "fuse: trying to steal weird page\n");
787		printk(KERN_WARNING "  page=%p index=%li flags=%08lx, count=%i, mapcount=%i, mapping=%p\n", page, page->index, page->flags, page_count(page), page_mapcount(page), page->mapping);
788		return 1;
789	}
790	return 0;
791}
792
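/*
 * Try to steal the page from the pipe buffer and install it in the
 * page cache in place of *pagep, avoiding a copy.  Returns 1 if the
 * caller should fall back to copying the data instead.
 */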
793static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
794{
795	int err;
796	struct page *oldpage = *pagep;
797	struct page *newpage;
798	struct pipe_buffer *buf = cs->pipebufs;
799
800	unlock_request(cs->fc, cs->req);
801	fuse_copy_finish(cs);
802
803	err = buf->ops->confirm(cs->pipe, buf);
804	if (err)
805		return err;
806
807	BUG_ON(!cs->nr_segs);
808	cs->currbuf = buf;
809	cs->len = buf->len;
810	cs->pipebufs++;
811	cs->nr_segs--;
812
813	if (cs->len != PAGE_SIZE)
814		goto out_fallback;
815
816	if (buf->ops->steal(cs->pipe, buf) != 0)
817		goto out_fallback;
818
819	newpage = buf->page;
820
821	if (WARN_ON(!PageUptodate(newpage)))
822		return -EIO;
823
824	ClearPageMappedToDisk(newpage);
825
826	if (fuse_check_page(newpage) != 0)
827		goto out_fallback_unlock;
828
829	/*
830	 * This is a new and locked page, it shouldn't be mapped or
831	 * have any special flags on it
832	 */
833	if (WARN_ON(page_mapped(oldpage)))
834		goto out_fallback_unlock;
835	if (WARN_ON(page_has_private(oldpage)))
836		goto out_fallback_unlock;
837	if (WARN_ON(PageDirty(oldpage) || PageWriteback(oldpage)))
838		goto out_fallback_unlock;
839	if (WARN_ON(PageMlocked(oldpage)))
840		goto out_fallback_unlock;
841
842	err = replace_page_cache_page(oldpage, newpage, GFP_KERNEL);
843	if (err) {
844		unlock_page(newpage);
845		return err;
846	}
847
848	page_cache_get(newpage);
849
850	if (!(buf->flags & PIPE_BUF_FLAG_LRU))
851		lru_cache_add_file(newpage);
852
853	err = 0;
854	spin_lock(&cs->fc->lock);
855	if (cs->req->aborted)
856		err = -ENOENT;
857	else
858		*pagep = newpage;
859	spin_unlock(&cs->fc->lock);
860
861	if (err) {
862		unlock_page(newpage);
863		page_cache_release(newpage);
864		return err;
865	}
866
867	unlock_page(oldpage);
868	page_cache_release(oldpage);
869	cs->len = 0;
870
871	return 0;
872
873out_fallback_unlock:
874	unlock_page(newpage);
875out_fallback:
876	cs->pg = buf->page;
877	cs->offset = buf->offset;
878
879	err = lock_request(cs->fc, cs->req);
880	if (err)
881		return err;
882
883	return 1;
884}
885
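/*
 * Reference a request page from a new pipe buffer, so that a splice
 * read can hand it to userspace without copying.
 */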
886static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page,
887			 unsigned offset, unsigned count)
888{
889	struct pipe_buffer *buf;
890
891	if (cs->nr_segs == cs->pipe->buffers)
892		return -EIO;
893
894	unlock_request(cs->fc, cs->req);
895	fuse_copy_finish(cs);
896
897	buf = cs->pipebufs;
898	page_cache_get(page);
899	buf->page = page;
900	buf->offset = offset;
901	buf->len = count;
902
903	cs->pipebufs++;
904	cs->nr_segs++;
905	cs->len = 0;
906
907	return 0;
908}
909
910/*
911 * Copy a page in the request to/from the userspace buffer.  Must be
912 * done atomically
913 */
914static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep,
915			  unsigned offset, unsigned count, int zeroing)
916{
917	int err;
918	struct page *page = *pagep;
919
920	if (page && zeroing && count < PAGE_SIZE)
921		clear_highpage(page);
922
923	while (count) {
924		if (cs->write && cs->pipebufs && page) {
925			return fuse_ref_page(cs, page, offset, count);
926		} else if (!cs->len) {
927			if (cs->move_pages && page &&
928			    offset == 0 && count == PAGE_SIZE) {
929				err = fuse_try_move_page(cs, pagep);
930				if (err <= 0)
931					return err;
932			} else {
933				err = fuse_copy_fill(cs);
934				if (err)
935					return err;
936			}
937		}
938		if (page) {
939			void *mapaddr = kmap_atomic(page);
940			void *buf = mapaddr + offset;
941			offset += fuse_copy_do(cs, &buf, &count);
942			kunmap_atomic(mapaddr);
943		} else
944			offset += fuse_copy_do(cs, NULL, &count);
945	}
946	if (page && !cs->write)
947		flush_dcache_page(page);
948	return 0;
949}
950
951/* Copy pages in the request to/from userspace buffer */
952static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes,
953			   int zeroing)
954{
955	unsigned i;
956	struct fuse_req *req = cs->req;
957
958	for (i = 0; i < req->num_pages && (nbytes || zeroing); i++) {
959		int err;
960		unsigned offset = req->page_descs[i].offset;
961		unsigned count = min(nbytes, req->page_descs[i].length);
962
963		err = fuse_copy_page(cs, &req->pages[i], offset, count,
964				     zeroing);
965		if (err)
966			return err;
967
968		nbytes -= count;
969	}
970	return 0;
971}
972
973/* Copy a single argument in the request to/from userspace buffer */
974static int fuse_copy_one(struct fuse_copy_state *cs, void *val, unsigned size)
975{
976	while (size) {
977		if (!cs->len) {
978			int err = fuse_copy_fill(cs);
979			if (err)
980				return err;
981		}
982		fuse_copy_do(cs, &val, &size);
983	}
984	return 0;
985}
986
987/* Copy request arguments to/from userspace buffer */
988static int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs,
989			  unsigned argpages, struct fuse_arg *args,
990			  int zeroing)
991{
992	int err = 0;
993	unsigned i;
994
995	for (i = 0; !err && i < numargs; i++)  {
996		struct fuse_arg *arg = &args[i];
997		if (i == numargs - 1 && argpages)
998			err = fuse_copy_pages(cs, arg->size, zeroing);
999		else
1000			err = fuse_copy_one(cs, arg->value, arg->size);
1001	}
1002	return err;
1003}
1004
1005static int forget_pending(struct fuse_conn *fc)
1006{
1007	return fc->forget_list_head.next != NULL;
1008}
1009
1010static int request_pending(struct fuse_conn *fc)
1011{
1012	return !list_empty(&fc->pending) || !list_empty(&fc->interrupts) ||
1013		forget_pending(fc);
1014}
1015
1016/* Wait until a request is available on the pending list */
1017static void request_wait(struct fuse_conn *fc)
1018__releases(fc->lock)
1019__acquires(fc->lock)
1020{
1021	DECLARE_WAITQUEUE(wait, current);
1022
1023	add_wait_queue_exclusive(&fc->waitq, &wait);
1024	while (fc->connected && !request_pending(fc)) {
1025		set_current_state(TASK_INTERRUPTIBLE);
1026		if (signal_pending(current))
1027			break;
1028
1029		spin_unlock(&fc->lock);
1030		schedule();
1031		spin_lock(&fc->lock);
1032	}
1033	set_current_state(TASK_RUNNING);
1034	remove_wait_queue(&fc->waitq, &wait);
1035}
1036
1037/*
1038 * Transfer an interrupt request to userspace
1039 *
 * Unlike other requests, this is assembled on demand, without a need
1041 * to allocate a separate fuse_req structure.
1042 *
1043 * Called with fc->lock held, releases it
1044 */
1045static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_copy_state *cs,
1046			       size_t nbytes, struct fuse_req *req)
1047__releases(fc->lock)
1048{
1049	struct fuse_in_header ih;
1050	struct fuse_interrupt_in arg;
1051	unsigned reqsize = sizeof(ih) + sizeof(arg);
1052	int err;
1053
1054	list_del_init(&req->intr_entry);
1055	req->intr_unique = fuse_get_unique(fc);
1056	memset(&ih, 0, sizeof(ih));
1057	memset(&arg, 0, sizeof(arg));
1058	ih.len = reqsize;
1059	ih.opcode = FUSE_INTERRUPT;
1060	ih.unique = req->intr_unique;
1061	arg.unique = req->in.h.unique;
1062
1063	spin_unlock(&fc->lock);
1064	if (nbytes < reqsize)
1065		return -EINVAL;
1066
1067	err = fuse_copy_one(cs, &ih, sizeof(ih));
1068	if (!err)
1069		err = fuse_copy_one(cs, &arg, sizeof(arg));
1070	fuse_copy_finish(cs);
1071
1072	return err ? err : reqsize;
1073}
1074
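/*
 * Detach up to 'max' entries from the forget list and return them.
 * Called with fc->lock held.
 */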
1075static struct fuse_forget_link *dequeue_forget(struct fuse_conn *fc,
1076					       unsigned max,
1077					       unsigned *countp)
1078{
1079	struct fuse_forget_link *head = fc->forget_list_head.next;
1080	struct fuse_forget_link **newhead = &head;
1081	unsigned count;
1082
1083	for (count = 0; *newhead != NULL && count < max; count++)
1084		newhead = &(*newhead)->next;
1085
1086	fc->forget_list_head.next = *newhead;
1087	*newhead = NULL;
1088	if (fc->forget_list_head.next == NULL)
1089		fc->forget_list_tail = &fc->forget_list_head;
1090
1091	if (countp != NULL)
1092		*countp = count;
1093
1094	return head;
1095}
1096
1097static int fuse_read_single_forget(struct fuse_conn *fc,
1098				   struct fuse_copy_state *cs,
1099				   size_t nbytes)
1100__releases(fc->lock)
1101{
1102	int err;
1103	struct fuse_forget_link *forget = dequeue_forget(fc, 1, NULL);
1104	struct fuse_forget_in arg = {
1105		.nlookup = forget->forget_one.nlookup,
1106	};
1107	struct fuse_in_header ih = {
1108		.opcode = FUSE_FORGET,
1109		.nodeid = forget->forget_one.nodeid,
1110		.unique = fuse_get_unique(fc),
1111		.len = sizeof(ih) + sizeof(arg),
1112	};
1113
1114	spin_unlock(&fc->lock);
1115	kfree(forget);
1116	if (nbytes < ih.len)
1117		return -EINVAL;
1118
1119	err = fuse_copy_one(cs, &ih, sizeof(ih));
1120	if (!err)
1121		err = fuse_copy_one(cs, &arg, sizeof(arg));
1122	fuse_copy_finish(cs);
1123
1124	if (err)
1125		return err;
1126
1127	return ih.len;
1128}
1129
1130static int fuse_read_batch_forget(struct fuse_conn *fc,
1131				   struct fuse_copy_state *cs, size_t nbytes)
1132__releases(fc->lock)
1133{
1134	int err;
1135	unsigned max_forgets;
1136	unsigned count;
1137	struct fuse_forget_link *head;
1138	struct fuse_batch_forget_in arg = { .count = 0 };
1139	struct fuse_in_header ih = {
1140		.opcode = FUSE_BATCH_FORGET,
1141		.unique = fuse_get_unique(fc),
1142		.len = sizeof(ih) + sizeof(arg),
1143	};
1144
1145	if (nbytes < ih.len) {
1146		spin_unlock(&fc->lock);
1147		return -EINVAL;
1148	}
1149
1150	max_forgets = (nbytes - ih.len) / sizeof(struct fuse_forget_one);
1151	head = dequeue_forget(fc, max_forgets, &count);
1152	spin_unlock(&fc->lock);
1153
1154	arg.count = count;
1155	ih.len += count * sizeof(struct fuse_forget_one);
1156	err = fuse_copy_one(cs, &ih, sizeof(ih));
1157	if (!err)
1158		err = fuse_copy_one(cs, &arg, sizeof(arg));
1159
1160	while (head) {
1161		struct fuse_forget_link *forget = head;
1162
1163		if (!err) {
1164			err = fuse_copy_one(cs, &forget->forget_one,
1165					    sizeof(forget->forget_one));
1166		}
1167		head = forget->next;
1168		kfree(forget);
1169	}
1170
1171	fuse_copy_finish(cs);
1172
1173	if (err)
1174		return err;
1175
1176	return ih.len;
1177}
1178
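/*
 * Send queued forgets to userspace, batching them if the protocol
 * version supports FUSE_BATCH_FORGET.
 */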
1179static int fuse_read_forget(struct fuse_conn *fc, struct fuse_copy_state *cs,
1180			    size_t nbytes)
1181__releases(fc->lock)
1182{
1183	if (fc->minor < 16 || fc->forget_list_head.next->next == NULL)
1184		return fuse_read_single_forget(fc, cs, nbytes);
1185	else
1186		return fuse_read_batch_forget(fc, cs, nbytes);
1187}
1188
/*
 * Read a single request into the userspace filesystem's buffer.  This
 * function waits until a request is available, then removes it from
 * the pending list and copies the request data to the userspace
 * buffer.  If no reply is needed (FORGET), the request has been
 * aborted, or an error occurred during the copying, then it is
 * finished by calling request_end().  Otherwise add it to the
 * processing list and set the 'sent' flag.
 */
1198static ssize_t fuse_dev_do_read(struct fuse_conn *fc, struct file *file,
1199				struct fuse_copy_state *cs, size_t nbytes)
1200{
1201	int err;
1202	struct fuse_req *req;
1203	struct fuse_in *in;
1204	unsigned reqsize;
1205
1206 restart:
1207	spin_lock(&fc->lock);
1208	err = -EAGAIN;
1209	if ((file->f_flags & O_NONBLOCK) && fc->connected &&
1210	    !request_pending(fc))
1211		goto err_unlock;
1212
1213	request_wait(fc);
1214	err = -ENODEV;
1215	if (!fc->connected)
1216		goto err_unlock;
1217	err = -ERESTARTSYS;
1218	if (!request_pending(fc))
1219		goto err_unlock;
1220
1221	if (!list_empty(&fc->interrupts)) {
1222		req = list_entry(fc->interrupts.next, struct fuse_req,
1223				 intr_entry);
1224		return fuse_read_interrupt(fc, cs, nbytes, req);
1225	}
1226
1227	if (forget_pending(fc)) {
1228		if (list_empty(&fc->pending) || fc->forget_batch-- > 0)
1229			return fuse_read_forget(fc, cs, nbytes);
1230
1231		if (fc->forget_batch <= -8)
1232			fc->forget_batch = 16;
1233	}
1234
1235	req = list_entry(fc->pending.next, struct fuse_req, list);
1236	req->state = FUSE_REQ_READING;
1237	list_move(&req->list, &fc->io);
1238
1239	in = &req->in;
1240	reqsize = in->h.len;
1241	/* If request is too large, reply with an error and restart the read */
1242	if (nbytes < reqsize) {
1243		req->out.h.error = -EIO;
		/* SETXATTR is special, since its data may be too large */
1245		if (in->h.opcode == FUSE_SETXATTR)
1246			req->out.h.error = -E2BIG;
1247		request_end(fc, req);
1248		goto restart;
1249	}
1250	spin_unlock(&fc->lock);
1251	cs->req = req;
1252	err = fuse_copy_one(cs, &in->h, sizeof(in->h));
1253	if (!err)
1254		err = fuse_copy_args(cs, in->numargs, in->argpages,
1255				     (struct fuse_arg *) in->args, 0);
1256	fuse_copy_finish(cs);
1257	spin_lock(&fc->lock);
1258	req->locked = 0;
1259	if (req->aborted) {
1260		request_end(fc, req);
1261		return -ENODEV;
1262	}
1263	if (err) {
1264		req->out.h.error = -EIO;
1265		request_end(fc, req);
1266		return err;
1267	}
1268	if (!req->isreply)
1269		request_end(fc, req);
1270	else {
1271		req->state = FUSE_REQ_SENT;
1272		list_move_tail(&req->list, &fc->processing);
1273		if (req->interrupted)
1274			queue_interrupt(fc, req);
1275		spin_unlock(&fc->lock);
1276	}
1277	return reqsize;
1278
1279 err_unlock:
1280	spin_unlock(&fc->lock);
1281	return err;
1282}
1283
1284static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
1285			      unsigned long nr_segs, loff_t pos)
1286{
1287	struct fuse_copy_state cs;
1288	struct file *file = iocb->ki_filp;
1289	struct fuse_conn *fc = fuse_get_conn(file);
1290	if (!fc)
1291		return -EPERM;
1292
1293	fuse_copy_init(&cs, fc, 1, iov, nr_segs);
1294
1295	return fuse_dev_do_read(fc, file, &cs, iov_length(iov, nr_segs));
1296}
1297
1298static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,
1299				    struct pipe_inode_info *pipe,
1300				    size_t len, unsigned int flags)
1301{
1302	int ret;
1303	int page_nr = 0;
1304	int do_wakeup = 0;
1305	struct pipe_buffer *bufs;
1306	struct fuse_copy_state cs;
1307	struct fuse_conn *fc = fuse_get_conn(in);
1308	if (!fc)
1309		return -EPERM;
1310
1311	bufs = kmalloc(pipe->buffers * sizeof(struct pipe_buffer), GFP_KERNEL);
1312	if (!bufs)
1313		return -ENOMEM;
1314
1315	fuse_copy_init(&cs, fc, 1, NULL, 0);
1316	cs.pipebufs = bufs;
1317	cs.pipe = pipe;
1318	ret = fuse_dev_do_read(fc, in, &cs, len);
1319	if (ret < 0)
1320		goto out;
1321
1322	ret = 0;
1323	pipe_lock(pipe);
1324
1325	if (!pipe->readers) {
1326		send_sig(SIGPIPE, current, 0);
1327		if (!ret)
1328			ret = -EPIPE;
1329		goto out_unlock;
1330	}
1331
1332	if (pipe->nrbufs + cs.nr_segs > pipe->buffers) {
1333		ret = -EIO;
1334		goto out_unlock;
1335	}
1336
1337	while (page_nr < cs.nr_segs) {
1338		int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
1339		struct pipe_buffer *buf = pipe->bufs + newbuf;
1340
1341		buf->page = bufs[page_nr].page;
1342		buf->offset = bufs[page_nr].offset;
1343		buf->len = bufs[page_nr].len;
1344		/*
1345		 * Need to be careful about this.  Having buf->ops in module
1346		 * code can Oops if the buffer persists after module unload.
1347		 */
1348		buf->ops = &nosteal_pipe_buf_ops;
1349
1350		pipe->nrbufs++;
1351		page_nr++;
1352		ret += buf->len;
1353
1354		if (pipe->files)
1355			do_wakeup = 1;
1356	}
1357
1358out_unlock:
1359	pipe_unlock(pipe);
1360
1361	if (do_wakeup) {
1362		smp_mb();
1363		if (waitqueue_active(&pipe->wait))
1364			wake_up_interruptible(&pipe->wait);
1365		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
1366	}
1367
1368out:
1369	for (; page_nr < cs.nr_segs; page_nr++)
1370		page_cache_release(bufs[page_nr].page);
1371
1372	kfree(bufs);
1373	return ret;
1374}
1375
1376static int fuse_notify_poll(struct fuse_conn *fc, unsigned int size,
1377			    struct fuse_copy_state *cs)
1378{
1379	struct fuse_notify_poll_wakeup_out outarg;
1380	int err = -EINVAL;
1381
1382	if (size != sizeof(outarg))
1383		goto err;
1384
1385	err = fuse_copy_one(cs, &outarg, sizeof(outarg));
1386	if (err)
1387		goto err;
1388
1389	fuse_copy_finish(cs);
1390	return fuse_notify_poll_wakeup(fc, &outarg);
1391
1392err:
1393	fuse_copy_finish(cs);
1394	return err;
1395}
1396
1397static int fuse_notify_inval_inode(struct fuse_conn *fc, unsigned int size,
1398				   struct fuse_copy_state *cs)
1399{
1400	struct fuse_notify_inval_inode_out outarg;
1401	int err = -EINVAL;
1402
1403	if (size != sizeof(outarg))
1404		goto err;
1405
1406	err = fuse_copy_one(cs, &outarg, sizeof(outarg));
1407	if (err)
1408		goto err;
1409	fuse_copy_finish(cs);
1410
1411	down_read(&fc->killsb);
1412	err = -ENOENT;
1413	if (fc->sb) {
1414		err = fuse_reverse_inval_inode(fc->sb, outarg.ino,
1415					       outarg.off, outarg.len);
1416	}
1417	up_read(&fc->killsb);
1418	return err;
1419
1420err:
1421	fuse_copy_finish(cs);
1422	return err;
1423}
1424
1425static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size,
1426				   struct fuse_copy_state *cs)
1427{
1428	struct fuse_notify_inval_entry_out outarg;
1429	int err = -ENOMEM;
1430	char *buf;
1431	struct qstr name;
1432
1433	buf = kzalloc(FUSE_NAME_MAX + 1, GFP_KERNEL);
1434	if (!buf)
1435		goto err;
1436
1437	err = -EINVAL;
1438	if (size < sizeof(outarg))
1439		goto err;
1440
1441	err = fuse_copy_one(cs, &outarg, sizeof(outarg));
1442	if (err)
1443		goto err;
1444
1445	err = -ENAMETOOLONG;
1446	if (outarg.namelen > FUSE_NAME_MAX)
1447		goto err;
1448
1449	err = -EINVAL;
1450	if (size != sizeof(outarg) + outarg.namelen + 1)
1451		goto err;
1452
1453	name.name = buf;
1454	name.len = outarg.namelen;
1455	err = fuse_copy_one(cs, buf, outarg.namelen + 1);
1456	if (err)
1457		goto err;
1458	fuse_copy_finish(cs);
1459	buf[outarg.namelen] = 0;
1460	name.hash = full_name_hash(name.name, name.len);
1461
1462	down_read(&fc->killsb);
1463	err = -ENOENT;
1464	if (fc->sb)
1465		err = fuse_reverse_inval_entry(fc->sb, outarg.parent, 0, &name);
1466	up_read(&fc->killsb);
1467	kfree(buf);
1468	return err;
1469
1470err:
1471	kfree(buf);
1472	fuse_copy_finish(cs);
1473	return err;
1474}
1475
1476static int fuse_notify_delete(struct fuse_conn *fc, unsigned int size,
1477			      struct fuse_copy_state *cs)
1478{
1479	struct fuse_notify_delete_out outarg;
1480	int err = -ENOMEM;
1481	char *buf;
1482	struct qstr name;
1483
1484	buf = kzalloc(FUSE_NAME_MAX + 1, GFP_KERNEL);
1485	if (!buf)
1486		goto err;
1487
1488	err = -EINVAL;
1489	if (size < sizeof(outarg))
1490		goto err;
1491
1492	err = fuse_copy_one(cs, &outarg, sizeof(outarg));
1493	if (err)
1494		goto err;
1495
1496	err = -ENAMETOOLONG;
1497	if (outarg.namelen > FUSE_NAME_MAX)
1498		goto err;
1499
1500	err = -EINVAL;
1501	if (size != sizeof(outarg) + outarg.namelen + 1)
1502		goto err;
1503
1504	name.name = buf;
1505	name.len = outarg.namelen;
1506	err = fuse_copy_one(cs, buf, outarg.namelen + 1);
1507	if (err)
1508		goto err;
1509	fuse_copy_finish(cs);
1510	buf[outarg.namelen] = 0;
1511	name.hash = full_name_hash(name.name, name.len);
1512
1513	down_read(&fc->killsb);
1514	err = -ENOENT;
1515	if (fc->sb)
1516		err = fuse_reverse_inval_entry(fc->sb, outarg.parent,
1517					       outarg.child, &name);
1518	up_read(&fc->killsb);
1519	kfree(buf);
1520	return err;
1521
1522err:
1523	kfree(buf);
1524	fuse_copy_finish(cs);
1525	return err;
1526}
1527
1528static int fuse_notify_store(struct fuse_conn *fc, unsigned int size,
1529			     struct fuse_copy_state *cs)
1530{
1531	struct fuse_notify_store_out outarg;
1532	struct inode *inode;
1533	struct address_space *mapping;
1534	u64 nodeid;
1535	int err;
1536	pgoff_t index;
1537	unsigned int offset;
1538	unsigned int num;
1539	loff_t file_size;
1540	loff_t end;
1541
1542	err = -EINVAL;
1543	if (size < sizeof(outarg))
1544		goto out_finish;
1545
1546	err = fuse_copy_one(cs, &outarg, sizeof(outarg));
1547	if (err)
1548		goto out_finish;
1549
1550	err = -EINVAL;
1551	if (size - sizeof(outarg) != outarg.size)
1552		goto out_finish;
1553
1554	nodeid = outarg.nodeid;
1555
1556	down_read(&fc->killsb);
1557
1558	err = -ENOENT;
1559	if (!fc->sb)
1560		goto out_up_killsb;
1561
1562	inode = ilookup5(fc->sb, nodeid, fuse_inode_eq, &nodeid);
1563	if (!inode)
1564		goto out_up_killsb;
1565
1566	mapping = inode->i_mapping;
1567	index = outarg.offset >> PAGE_CACHE_SHIFT;
1568	offset = outarg.offset & ~PAGE_CACHE_MASK;
1569	file_size = i_size_read(inode);
1570	end = outarg.offset + outarg.size;
1571	if (end > file_size) {
1572		file_size = end;
1573		fuse_write_update_size(inode, file_size);
1574	}
1575
1576	num = outarg.size;
1577	while (num) {
1578		struct page *page;
1579		unsigned int this_num;
1580
1581		err = -ENOMEM;
1582		page = find_or_create_page(mapping, index,
1583					   mapping_gfp_mask(mapping));
1584		if (!page)
1585			goto out_iput;
1586
1587		this_num = min_t(unsigned, num, PAGE_CACHE_SIZE - offset);
1588		err = fuse_copy_page(cs, &page, offset, this_num, 0);
1589		if (!err && offset == 0 &&
1590		    (this_num == PAGE_CACHE_SIZE || file_size == end))
1591			SetPageUptodate(page);
1592		unlock_page(page);
1593		page_cache_release(page);
1594
1595		if (err)
1596			goto out_iput;
1597
1598		num -= this_num;
1599		offset = 0;
1600		index++;
1601	}
1602
1603	err = 0;
1604
1605out_iput:
1606	iput(inode);
1607out_up_killsb:
1608	up_read(&fc->killsb);
1609out_finish:
1610	fuse_copy_finish(cs);
1611	return err;
1612}
1613
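/* Drop the page references taken by fuse_retrieve() */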
1614static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_req *req)
1615{
1616	release_pages(req->pages, req->num_pages, false);
1617}
1618
1619static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
1620			 struct fuse_notify_retrieve_out *outarg)
1621{
1622	int err;
1623	struct address_space *mapping = inode->i_mapping;
1624	struct fuse_req *req;
1625	pgoff_t index;
1626	loff_t file_size;
1627	unsigned int num;
1628	unsigned int offset;
1629	size_t total_len = 0;
1630	int num_pages;
1631
1632	offset = outarg->offset & ~PAGE_CACHE_MASK;
1633	file_size = i_size_read(inode);
1634
1635	num = outarg->size;
1636	if (outarg->offset > file_size)
1637		num = 0;
1638	else if (outarg->offset + num > file_size)
1639		num = file_size - outarg->offset;
1640
1641	num_pages = (num + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
1642	num_pages = min(num_pages, FUSE_MAX_PAGES_PER_REQ);
1643
1644	req = fuse_get_req(fc, num_pages);
1645	if (IS_ERR(req))
1646		return PTR_ERR(req);
1647
1648	req->in.h.opcode = FUSE_NOTIFY_REPLY;
1649	req->in.h.nodeid = outarg->nodeid;
1650	req->in.numargs = 2;
1651	req->in.argpages = 1;
1652	req->page_descs[0].offset = offset;
1653	req->end = fuse_retrieve_end;
1654
1655	index = outarg->offset >> PAGE_CACHE_SHIFT;
1656
1657	while (num && req->num_pages < num_pages) {
1658		struct page *page;
1659		unsigned int this_num;
1660
1661		page = find_get_page(mapping, index);
1662		if (!page)
1663			break;
1664
1665		this_num = min_t(unsigned, num, PAGE_CACHE_SIZE - offset);
1666		req->pages[req->num_pages] = page;
1667		req->page_descs[req->num_pages].length = this_num;
1668		req->num_pages++;
1669
1670		offset = 0;
1671		num -= this_num;
1672		total_len += this_num;
1673		index++;
1674	}
1675	req->misc.retrieve_in.offset = outarg->offset;
1676	req->misc.retrieve_in.size = total_len;
1677	req->in.args[0].size = sizeof(req->misc.retrieve_in);
1678	req->in.args[0].value = &req->misc.retrieve_in;
1679	req->in.args[1].size = total_len;
1680
1681	err = fuse_request_send_notify_reply(fc, req, outarg->notify_unique);
1682	if (err)
1683		fuse_retrieve_end(fc, req);
1684
1685	return err;
1686}
1687
1688static int fuse_notify_retrieve(struct fuse_conn *fc, unsigned int size,
1689				struct fuse_copy_state *cs)
1690{
1691	struct fuse_notify_retrieve_out outarg;
1692	struct inode *inode;
1693	int err;
1694
1695	err = -EINVAL;
1696	if (size != sizeof(outarg))
1697		goto copy_finish;
1698
1699	err = fuse_copy_one(cs, &outarg, sizeof(outarg));
1700	if (err)
1701		goto copy_finish;
1702
1703	fuse_copy_finish(cs);
1704
1705	down_read(&fc->killsb);
1706	err = -ENOENT;
1707	if (fc->sb) {
1708		u64 nodeid = outarg.nodeid;
1709
1710		inode = ilookup5(fc->sb, nodeid, fuse_inode_eq, &nodeid);
1711		if (inode) {
1712			err = fuse_retrieve(fc, inode, &outarg);
1713			iput(inode);
1714		}
1715	}
1716	up_read(&fc->killsb);
1717
1718	return err;
1719
1720copy_finish:
1721	fuse_copy_finish(cs);
1722	return err;
1723}
1724
1725static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
1726		       unsigned int size, struct fuse_copy_state *cs)
1727{
1728	switch (code) {
1729	case FUSE_NOTIFY_POLL:
1730		return fuse_notify_poll(fc, size, cs);
1731
1732	case FUSE_NOTIFY_INVAL_INODE:
1733		return fuse_notify_inval_inode(fc, size, cs);
1734
1735	case FUSE_NOTIFY_INVAL_ENTRY:
1736		return fuse_notify_inval_entry(fc, size, cs);
1737
1738	case FUSE_NOTIFY_STORE:
1739		return fuse_notify_store(fc, size, cs);
1740
1741	case FUSE_NOTIFY_RETRIEVE:
1742		return fuse_notify_retrieve(fc, size, cs);
1743
1744	case FUSE_NOTIFY_DELETE:
1745		return fuse_notify_delete(fc, size, cs);
1746
1747	default:
1748		fuse_copy_finish(cs);
1749		return -EINVAL;
1750	}
1751}
1752
/* Look up a request on the processing list by its unique ID */
1754static struct fuse_req *request_find(struct fuse_conn *fc, u64 unique)
1755{
1756	struct fuse_req *req;
1757
1758	list_for_each_entry(req, &fc->processing, list) {
1759		if (req->in.h.unique == unique || req->intr_unique == unique)
1760			return req;
1761	}
1762	return NULL;
1763}
1764
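/*
 * Copy the reply arguments from the userspace buffer into the request.
 * The last argument may be shorter than expected if out->argvar is set.
 */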
1765static int copy_out_args(struct fuse_copy_state *cs, struct fuse_out *out,
1766			 unsigned nbytes)
1767{
1768	unsigned reqsize = sizeof(struct fuse_out_header);
1769
1770	if (out->h.error)
1771		return nbytes != reqsize ? -EINVAL : 0;
1772
1773	reqsize += len_args(out->numargs, out->args);
1774
1775	if (reqsize < nbytes || (reqsize > nbytes && !out->argvar))
1776		return -EINVAL;
1777	else if (reqsize > nbytes) {
1778		struct fuse_arg *lastarg = &out->args[out->numargs-1];
1779		unsigned diffsize = reqsize - nbytes;
1780		if (diffsize > lastarg->size)
1781			return -EINVAL;
1782		lastarg->size -= diffsize;
1783	}
1784	return fuse_copy_args(cs, out->numargs, out->argpages, out->args,
1785			      out->page_zeroing);
1786}
1787
/*
 * Write a single reply to a request.  First the header is copied from
 * the write buffer.  The request is then looked up on the processing
 * list by the unique ID found in the header.  If found, remove it
 * from the list and copy the rest of the buffer to the request.
 * The request is finished by calling request_end().
 */
1795static ssize_t fuse_dev_do_write(struct fuse_conn *fc,
1796				 struct fuse_copy_state *cs, size_t nbytes)
1797{
1798	int err;
1799	struct fuse_req *req;
1800	struct fuse_out_header oh;
1801
1802	if (nbytes < sizeof(struct fuse_out_header))
1803		return -EINVAL;
1804
1805	err = fuse_copy_one(cs, &oh, sizeof(oh));
1806	if (err)
1807		goto err_finish;
1808
1809	err = -EINVAL;
1810	if (oh.len != nbytes)
1811		goto err_finish;
1812
	/*
	 * A zero oh.unique indicates an unsolicited notification message;
	 * in that case the error field contains the notification code.
	 */
1817	if (!oh.unique) {
1818		err = fuse_notify(fc, oh.error, nbytes - sizeof(oh), cs);
1819		return err ? err : nbytes;
1820	}
1821
1822	err = -EINVAL;
1823	if (oh.error <= -1000 || oh.error > 0)
1824		goto err_finish;
1825
1826	spin_lock(&fc->lock);
1827	err = -ENOENT;
1828	if (!fc->connected)
1829		goto err_unlock;
1830
1831	req = request_find(fc, oh.unique);
1832	if (!req)
1833		goto err_unlock;
1834
1835	if (req->aborted) {
1836		spin_unlock(&fc->lock);
1837		fuse_copy_finish(cs);
1838		spin_lock(&fc->lock);
1839		request_end(fc, req);
1840		return -ENOENT;
1841	}
1842	/* Is it an interrupt reply? */
1843	if (req->intr_unique == oh.unique) {
1844		err = -EINVAL;
1845		if (nbytes != sizeof(struct fuse_out_header))
1846			goto err_unlock;
1847
1848		if (oh.error == -ENOSYS)
1849			fc->no_interrupt = 1;
1850		else if (oh.error == -EAGAIN)
1851			queue_interrupt(fc, req);
1852
1853		spin_unlock(&fc->lock);
1854		fuse_copy_finish(cs);
1855		return nbytes;
1856	}
1857
1858	req->state = FUSE_REQ_WRITING;
1859	list_move(&req->list, &fc->io);
1860	req->out.h = oh;
1861	req->locked = 1;
1862	cs->req = req;
1863	if (!req->out.page_replace)
1864		cs->move_pages = 0;
1865	spin_unlock(&fc->lock);
1866
1867	err = copy_out_args(cs, &req->out, nbytes);
1868	fuse_copy_finish(cs);
1869
1870	spin_lock(&fc->lock);
1871	req->locked = 0;
1872	if (!err) {
1873		if (req->aborted)
1874			err = -ENOENT;
1875	} else if (!req->aborted)
1876		req->out.h.error = -EIO;
1877	request_end(fc, req);
1878
1879	return err ? err : nbytes;
1880
1881 err_unlock:
1882	spin_unlock(&fc->lock);
1883 err_finish:
1884	fuse_copy_finish(cs);
1885	return err;
1886}
1887
1888static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
1889			      unsigned long nr_segs, loff_t pos)
1890{
1891	struct fuse_copy_state cs;
1892	struct fuse_conn *fc = fuse_get_conn(iocb->ki_filp);
1893	if (!fc)
1894		return -EPERM;
1895
1896	fuse_copy_init(&cs, fc, 0, iov, nr_segs);
1897
1898	return fuse_dev_do_write(fc, &cs, iov_length(iov, nr_segs));
1899}
1900
1901static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
1902				     struct file *out, loff_t *ppos,
1903				     size_t len, unsigned int flags)
1904{
1905	unsigned nbuf;
1906	unsigned idx;
1907	struct pipe_buffer *bufs;
1908	struct fuse_copy_state cs;
1909	struct fuse_conn *fc;
1910	size_t rem;
1911	ssize_t ret;
1912
1913	fc = fuse_get_conn(out);
1914	if (!fc)
1915		return -EPERM;
1916
1917	bufs = kmalloc(pipe->buffers * sizeof(struct pipe_buffer), GFP_KERNEL);
1918	if (!bufs)
1919		return -ENOMEM;
1920
1921	pipe_lock(pipe);
1922	nbuf = 0;
1923	rem = 0;
1924	for (idx = 0; idx < pipe->nrbufs && rem < len; idx++)
1925		rem += pipe->bufs[(pipe->curbuf + idx) & (pipe->buffers - 1)].len;
1926
1927	ret = -EINVAL;
1928	if (rem < len) {
1929		pipe_unlock(pipe);
1930		goto out;
1931	}
1932
1933	rem = len;
1934	while (rem) {
1935		struct pipe_buffer *ibuf;
1936		struct pipe_buffer *obuf;
1937
1938		BUG_ON(nbuf >= pipe->buffers);
1939		BUG_ON(!pipe->nrbufs);
1940		ibuf = &pipe->bufs[pipe->curbuf];
1941		obuf = &bufs[nbuf];
1942
1943		if (rem >= ibuf->len) {
1944			*obuf = *ibuf;
1945			ibuf->ops = NULL;
1946			pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
1947			pipe->nrbufs--;
1948		} else {
1949			ibuf->ops->get(pipe, ibuf);
1950			*obuf = *ibuf;
1951			obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1952			obuf->len = rem;
1953			ibuf->offset += obuf->len;
1954			ibuf->len -= obuf->len;
1955		}
1956		nbuf++;
1957		rem -= obuf->len;
1958	}
1959	pipe_unlock(pipe);
1960
1961	fuse_copy_init(&cs, fc, 0, NULL, nbuf);
1962	cs.pipebufs = bufs;
1963	cs.pipe = pipe;
1964
1965	if (flags & SPLICE_F_MOVE)
1966		cs.move_pages = 1;
1967
1968	ret = fuse_dev_do_write(fc, &cs, len);
1969
1970	for (idx = 0; idx < nbuf; idx++) {
1971		struct pipe_buffer *buf = &bufs[idx];
1972		buf->ops->release(pipe, buf);
1973	}
1974out:
1975	kfree(bufs);
1976	return ret;
1977}
1978
1979static unsigned fuse_dev_poll(struct file *file, poll_table *wait)
1980{
1981	unsigned mask = POLLOUT | POLLWRNORM;
1982	struct fuse_conn *fc = fuse_get_conn(file);
1983	if (!fc)
1984		return POLLERR;
1985
1986	poll_wait(file, &fc->waitq, wait);
1987
1988	spin_lock(&fc->lock);
1989	if (!fc->connected)
1990		mask = POLLERR;
1991	else if (request_pending(fc))
1992		mask |= POLLIN | POLLRDNORM;
1993	spin_unlock(&fc->lock);
1994
1995	return mask;
1996}
1997
1998/*
1999 * Abort all requests on the given list (pending or processing)
2000 *
2001 * This function releases and reacquires fc->lock
2002 */
2003static void end_requests(struct fuse_conn *fc, struct list_head *head)
2004__releases(fc->lock)
2005__acquires(fc->lock)
2006{
2007	while (!list_empty(head)) {
2008		struct fuse_req *req;
2009		req = list_entry(head->next, struct fuse_req, list);
2010		req->out.h.error = -ECONNABORTED;
2011		request_end(fc, req);
2012		spin_lock(&fc->lock);
2013	}
2014}
2015
2016/*
2017 * Abort requests under I/O
2018 *
2019 * The requests are set to aborted and finished, and the request
2020 * waiter is woken up.  This will make request_wait_answer() wait
2021 * until the request is unlocked and then return.
2022 *
2023 * If the request is asynchronous, then the end function needs to be
2024 * called after waiting for the request to be unlocked (if it was
2025 * locked).
2026 */
2027static void end_io_requests(struct fuse_conn *fc)
2028__releases(fc->lock)
2029__acquires(fc->lock)
2030{
2031	while (!list_empty(&fc->io)) {
2032		struct fuse_req *req =
2033			list_entry(fc->io.next, struct fuse_req, list);
2034		void (*end) (struct fuse_conn *, struct fuse_req *) = req->end;
2035
2036		req->aborted = 1;
2037		req->out.h.error = -ECONNABORTED;
2038		req->state = FUSE_REQ_FINISHED;
2039		list_del_init(&req->list);
2040		wake_up(&req->waitq);
2041		if (end) {
2042			req->end = NULL;
2043			__fuse_get_request(req);
2044			spin_unlock(&fc->lock);
2045			wait_event(req->waitq, !req->locked);
2046			end(fc, req);
2047			fuse_put_request(fc, req);
2048			spin_lock(&fc->lock);
2049		}
2050	}
2051}
2052
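/* Abort all pending and processing requests and free queued forgets */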
2053static void end_queued_requests(struct fuse_conn *fc)
2054__releases(fc->lock)
2055__acquires(fc->lock)
2056{
2057	fc->max_background = UINT_MAX;
2058	flush_bg_queue(fc);
2059	end_requests(fc, &fc->pending);
2060	end_requests(fc, &fc->processing);
2061	while (forget_pending(fc))
2062		kfree(dequeue_forget(fc, 1, NULL));
2063}
2064
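/* Wake up all waiters polling on files of this connection */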
2065static void end_polls(struct fuse_conn *fc)
2066{
2067	struct rb_node *p;
2068
2069	p = rb_first(&fc->polled_files);
2070
2071	while (p) {
2072		struct fuse_file *ff;
2073		ff = rb_entry(p, struct fuse_file, polled_node);
2074		wake_up_interruptible_all(&ff->poll_wait);
2075
2076		p = rb_next(p);
2077	}
2078}
2079
2080/*
2081 * Abort all requests.
2082 *
2083 * Emergency exit in case of a malicious or accidental deadlock, or
2084 * just a hung filesystem.
2085 *
2086 * The same effect is usually achievable through killing the
2087 * filesystem daemon and all users of the filesystem.  The exception
2088 * is the combination of an asynchronous request and the tricky
2089 * deadlock (see Documentation/filesystems/fuse.txt).
2090 *
 * During the aborting, progression of requests from the pending and
 * processing lists onto the io list, and progression of new requests
 * onto the pending list is prevented by fc->connected being false.
2094 *
2095 * Progression of requests under I/O to the processing list is
2096 * prevented by the req->aborted flag being true for these requests.
2097 * For this reason requests on the io list must be aborted first.
2098 */
2099void fuse_abort_conn(struct fuse_conn *fc)
2100{
2101	spin_lock(&fc->lock);
2102	if (fc->connected) {
2103		fc->connected = 0;
2104		fc->blocked = 0;
2105		fc->initialized = 1;
2106		end_io_requests(fc);
2107		end_queued_requests(fc);
2108		end_polls(fc);
2109		wake_up_all(&fc->waitq);
2110		wake_up_all(&fc->blocked_waitq);
2111		kill_fasync(&fc->fasync, SIGIO, POLL_IN);
2112	}
2113	spin_unlock(&fc->lock);
2114}
2115EXPORT_SYMBOL_GPL(fuse_abort_conn);
2116
2117int fuse_dev_release(struct inode *inode, struct file *file)
2118{
2119	struct fuse_conn *fc = fuse_get_conn(file);
2120	if (fc) {
2121		spin_lock(&fc->lock);
2122		fc->connected = 0;
2123		fc->blocked = 0;
2124		fc->initialized = 1;
2125		end_queued_requests(fc);
2126		end_polls(fc);
2127		wake_up_all(&fc->blocked_waitq);
2128		spin_unlock(&fc->lock);
2129		fuse_conn_put(fc);
2130	}
2131
2132	return 0;
2133}
2134EXPORT_SYMBOL_GPL(fuse_dev_release);
2135
2136static int fuse_dev_fasync(int fd, struct file *file, int on)
2137{
2138	struct fuse_conn *fc = fuse_get_conn(file);
2139	if (!fc)
2140		return -EPERM;
2141
2142	/* No locking - fasync_helper does its own locking */
2143	return fasync_helper(fd, file, on, &fc->fasync);
2144}
2145
2146const struct file_operations fuse_dev_operations = {
2147	.owner		= THIS_MODULE,
2148	.llseek		= no_llseek,
2149	.read		= do_sync_read,
2150	.aio_read	= fuse_dev_read,
2151	.splice_read	= fuse_dev_splice_read,
2152	.write		= do_sync_write,
2153	.aio_write	= fuse_dev_write,
2154	.splice_write	= fuse_dev_splice_write,
2155	.poll		= fuse_dev_poll,
2156	.release	= fuse_dev_release,
2157	.fasync		= fuse_dev_fasync,
2158};
2159EXPORT_SYMBOL_GPL(fuse_dev_operations);
2160
2161static struct miscdevice fuse_miscdevice = {
2162	.minor = FUSE_MINOR,
2163	.name  = "fuse",
2164	.fops = &fuse_dev_operations,
2165};
2166
2167int __init fuse_dev_init(void)
2168{
2169	int err = -ENOMEM;
2170	fuse_req_cachep = kmem_cache_create("fuse_request",
2171					    sizeof(struct fuse_req),
2172					    0, 0, NULL);
2173	if (!fuse_req_cachep)
2174		goto out;
2175
2176	err = misc_register(&fuse_miscdevice);
2177	if (err)
2178		goto out_cache_clean;
2179
2180	return 0;
2181
2182 out_cache_clean:
2183	kmem_cache_destroy(fuse_req_cachep);
2184 out:
2185	return err;
2186}
2187
2188void fuse_dev_cleanup(void)
2189{
2190	misc_deregister(&fuse_miscdevice);
2191	kmem_cache_destroy(fuse_req_cachep);
2192}
2193