filemap.c revision ebcf28e1c7a295f3321249dd235ad2e45938fdd9
1/*
2 *	linux/mm/filemap.c
3 *
4 * Copyright (C) 1994-1999  Linus Torvalds
5 */
6
7/*
8 * This file handles the generic file mmap semantics used by
9 * most "normal" filesystems (but you don't /have/ to use this:
10 * the NFS filesystem used to do this differently, for example)
11 */
12#include <linux/config.h>
13#include <linux/module.h>
14#include <linux/slab.h>
15#include <linux/compiler.h>
16#include <linux/fs.h>
17#include <linux/aio.h>
18#include <linux/capability.h>
19#include <linux/kernel_stat.h>
20#include <linux/mm.h>
21#include <linux/swap.h>
22#include <linux/mman.h>
23#include <linux/pagemap.h>
24#include <linux/file.h>
25#include <linux/uio.h>
26#include <linux/hash.h>
27#include <linux/writeback.h>
28#include <linux/pagevec.h>
29#include <linux/blkdev.h>
30#include <linux/security.h>
31#include <linux/syscalls.h>
32#include <linux/cpuset.h>
33#include "filemap.h"
34#include "internal.h"
35
36/*
37 * FIXME: remove all knowledge of the buffer layer from the core VM
38 */
39#include <linux/buffer_head.h> /* for generic_osync_inode */
40
41#include <asm/uaccess.h>
42#include <asm/mman.h>
43
44static ssize_t
45generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
46	loff_t offset, unsigned long nr_segs);
47
48/*
49 * Shared mappings implemented 30.11.1994. It's not fully working yet,
50 * though.
51 *
52 * Shared mappings now work. 15.8.1995  Bruno.
53 *
54 * finished 'unifying' the page and buffer cache and SMP-threaded the
55 * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
56 *
57 * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
58 */
59
60/*
61 * Lock ordering:
62 *
63 *  ->i_mmap_lock		(vmtruncate)
64 *    ->private_lock		(__free_pte->__set_page_dirty_buffers)
65 *      ->swap_lock		(exclusive_swap_page, others)
66 *        ->mapping->tree_lock
67 *
68 *  ->i_mutex
69 *    ->i_mmap_lock		(truncate->unmap_mapping_range)
70 *
71 *  ->mmap_sem
72 *    ->i_mmap_lock
73 *      ->page_table_lock or pte_lock	(various, mainly in memory.c)
74 *        ->mapping->tree_lock	(arch-dependent flush_dcache_mmap_lock)
75 *
76 *  ->mmap_sem
77 *    ->lock_page		(access_process_vm)
78 *
79 *  ->mmap_sem
80 *    ->i_mutex			(msync)
81 *
82 *  ->i_mutex
83 *    ->i_alloc_sem             (various)
84 *
85 *  ->inode_lock
86 *    ->sb_lock			(fs/fs-writeback.c)
87 *    ->mapping->tree_lock	(__sync_single_inode)
88 *
89 *  ->i_mmap_lock
90 *    ->anon_vma.lock		(vma_adjust)
91 *
92 *  ->anon_vma.lock
93 *    ->page_table_lock or pte_lock	(anon_vma_prepare and various)
94 *
95 *  ->page_table_lock or pte_lock
96 *    ->swap_lock		(try_to_unmap_one)
97 *    ->private_lock		(try_to_unmap_one)
98 *    ->tree_lock		(try_to_unmap_one)
99 *    ->zone.lru_lock		(follow_page->mark_page_accessed)
100 *    ->zone.lru_lock		(check_pte_range->isolate_lru_page)
101 *    ->private_lock		(page_remove_rmap->set_page_dirty)
102 *    ->tree_lock		(page_remove_rmap->set_page_dirty)
103 *    ->inode_lock		(page_remove_rmap->set_page_dirty)
104 *    ->inode_lock		(zap_pte_range->set_page_dirty)
105 *    ->private_lock		(zap_pte_range->__set_page_dirty_buffers)
106 *
107 *  ->task->proc_lock
108 *    ->dcache_lock		(proc_pid_lookup)
109 */
110
111/*
112 * Remove a page from the page cache and free it. Caller has to make
113 * sure the page is locked and that nobody else uses it - or that usage
114 * is safe.  The caller must hold a write_lock on the mapping's tree_lock.
115 */
116void __remove_from_page_cache(struct page *page)
117{
118	struct address_space *mapping = page->mapping;
119
120	radix_tree_delete(&mapping->page_tree, page->index);
121	page->mapping = NULL;
122	mapping->nrpages--;
123	pagecache_acct(-1);
124}
125
126void remove_from_page_cache(struct page *page)
127{
128	struct address_space *mapping = page->mapping;
129
130	BUG_ON(!PageLocked(page));
131
132	write_lock_irq(&mapping->tree_lock);
133	__remove_from_page_cache(page);
134	write_unlock_irq(&mapping->tree_lock);
135}
136
137static int sync_page(void *word)
138{
139	struct address_space *mapping;
140	struct page *page;
141
142	page = container_of((unsigned long *)word, struct page, flags);
143
144	/*
145	 * page_mapping() is being called without PG_locked held.
146	 * Some knowledge of the state and use of the page is used to
147	 * reduce the requirements down to a memory barrier.
148	 * The danger here is of a stale page_mapping() return value
149	 * indicating a struct address_space different from the one it's
150	 * associated with when it is associated with one.
151	 * After smp_mb(), it's either the correct page_mapping() for
152	 * the page, or an old page_mapping() and the page's own
153	 * page_mapping() has gone NULL.
154	 * The ->sync_page() address_space operation must tolerate
155	 * page_mapping() going NULL. By an amazing coincidence,
156	 * this comes about because none of the users of the page
157	 * in the ->sync_page() methods make essential use of the
158	 * page_mapping(), merely passing the page down to the backing
159	 * device's unplug functions when it's non-NULL, which in turn
160	 * ignore it for all cases but swap, where only page_private(page) is
161	 * of interest. When page_mapping() does go NULL, the entire
162	 * call stack gracefully ignores the page and returns.
163	 * -- wli
164	 */
165	smp_mb();
166	mapping = page_mapping(page);
167	if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
168		mapping->a_ops->sync_page(page);
169	io_schedule();
170	return 0;
171}
172
173/**
174 * __filemap_fdatawrite_range - start writeback against all of a mapping's
175 * dirty pages that lie within the byte offsets <start, end>
176 * @mapping:	address space structure to write
177 * @start:	offset in bytes where the range starts
178 * @end:	offset in bytes where the range ends (inclusive)
179 * @sync_mode:	enable synchronous operation
180 *
181 * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
182 * opposed to a regular memory cleansing writeback.  The difference between
183 * these two operations is that if a dirty page/buffer is encountered, it must
184 * be waited upon, and not just skipped over.
185 */
186int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
187				loff_t end, int sync_mode)
188{
189	int ret;
190	struct writeback_control wbc = {
191		.sync_mode = sync_mode,
192		.nr_to_write = mapping->nrpages * 2,
193		.start = start,
194		.end = end,
195	};
196
197	if (!mapping_cap_writeback_dirty(mapping))
198		return 0;
199
200	ret = do_writepages(mapping, &wbc);
201	return ret;
202}
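
/*
 * Illustrative sketch, not part of the original file: starting writeback
 * against one byte range of a mapping.  WB_SYNC_ALL asks for data-integrity
 * behaviour (dirty pages already under writeback are waited upon rather
 * than skipped); WB_SYNC_NONE is only a best-effort flush.  "pos", "count"
 * and "ret" are assumed to be supplied by the caller.
 *
 *	ret = __filemap_fdatawrite_range(mapping, pos, pos + count - 1,
 *					 WB_SYNC_ALL);
 *
 * Waiting for the writeback started here is a separate step, done with
 * wait_on_page_writeback_range() below.
 */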
203
204static inline int __filemap_fdatawrite(struct address_space *mapping,
205	int sync_mode)
206{
207	return __filemap_fdatawrite_range(mapping, 0, 0, sync_mode);
208}
209
210int filemap_fdatawrite(struct address_space *mapping)
211{
212	return __filemap_fdatawrite(mapping, WB_SYNC_ALL);
213}
214EXPORT_SYMBOL(filemap_fdatawrite);
215
216static int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
217				loff_t end)
218{
219	return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
220}
221
222/*
223 * This is a mostly non-blocking flush.  Not suitable for data-integrity
224 * purposes - I/O may not be started against all dirty pages.
225 */
226int filemap_flush(struct address_space *mapping)
227{
228	return __filemap_fdatawrite(mapping, WB_SYNC_NONE);
229}
230EXPORT_SYMBOL(filemap_flush);
231
232/*
233 * Wait for writeback to complete against pages indexed by start->end
234 * inclusive
235 */
236int wait_on_page_writeback_range(struct address_space *mapping,
237				pgoff_t start, pgoff_t end)
238{
239	struct pagevec pvec;
240	int nr_pages;
241	int ret = 0;
242	pgoff_t index;
243
244	if (end < start)
245		return 0;
246
247	pagevec_init(&pvec, 0);
248	index = start;
249	while ((index <= end) &&
250			(nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
251			PAGECACHE_TAG_WRITEBACK,
252			min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) {
253		unsigned i;
254
255		for (i = 0; i < nr_pages; i++) {
256			struct page *page = pvec.pages[i];
257
258			/* until radix tree lookup accepts end_index */
259			if (page->index > end)
260				continue;
261
262			wait_on_page_writeback(page);
263			if (PageError(page))
264				ret = -EIO;
265		}
266		pagevec_release(&pvec);
267		cond_resched();
268	}
269
270	/* Check for outstanding write errors */
271	if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
272		ret = -ENOSPC;
273	if (test_and_clear_bit(AS_EIO, &mapping->flags))
274		ret = -EIO;
275
276	return ret;
277}
278
279/*
280 * Write and wait upon all the pages in the passed range.  This is a "data
281 * integrity" operation.  It waits upon in-flight writeout before starting and
282 * waiting upon new writeout.  If there was an IO error, return it.
283 *
284 * We need to re-take i_mutex during the generic_osync_inode list walk because
285 * it is otherwise livelockable.
286 */
287int sync_page_range(struct inode *inode, struct address_space *mapping,
288			loff_t pos, loff_t count)
289{
290	pgoff_t start = pos >> PAGE_CACHE_SHIFT;
291	pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
292	int ret;
293
294	if (!mapping_cap_writeback_dirty(mapping) || !count)
295		return 0;
296	ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
297	if (ret == 0) {
298		mutex_lock(&inode->i_mutex);
299		ret = generic_osync_inode(inode, mapping, OSYNC_METADATA);
300		mutex_unlock(&inode->i_mutex);
301	}
302	if (ret == 0)
303		ret = wait_on_page_writeback_range(mapping, start, end);
304	return ret;
305}
306EXPORT_SYMBOL(sync_page_range);
307
308/*
309 * Note: Holding i_mutex across sync_page_range_nolock is not a good idea
310 * as it forces O_SYNC writers to different parts of the same file
311 * to be serialised right until io completion.
312 */
313int sync_page_range_nolock(struct inode *inode, struct address_space *mapping,
314			   loff_t pos, loff_t count)
315{
316	pgoff_t start = pos >> PAGE_CACHE_SHIFT;
317	pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
318	int ret;
319
320	if (!mapping_cap_writeback_dirty(mapping) || !count)
321		return 0;
322	ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
323	if (ret == 0)
324		ret = generic_osync_inode(inode, mapping, OSYNC_METADATA);
325	if (ret == 0)
326		ret = wait_on_page_writeback_range(mapping, start, end);
327	return ret;
328}
329EXPORT_SYMBOL(sync_page_range_nolock);
330
331/**
332 * filemap_fdatawait - walk the list of under-writeback pages of the given
333 *     address space and wait for all of them.
334 *
335 * @mapping: address space structure to wait for
336 */
337int filemap_fdatawait(struct address_space *mapping)
338{
339	loff_t i_size = i_size_read(mapping->host);
340
341	if (i_size == 0)
342		return 0;
343
344	return wait_on_page_writeback_range(mapping, 0,
345				(i_size - 1) >> PAGE_CACHE_SHIFT);
346}
347EXPORT_SYMBOL(filemap_fdatawait);
348
349int filemap_write_and_wait(struct address_space *mapping)
350{
351	int err = 0;
352
353	if (mapping->nrpages) {
354		err = filemap_fdatawrite(mapping);
355		/*
356		 * Even if the above returned error, the pages may be
357		 * written partially (e.g. -ENOSPC), so we wait for it.
358		 * But the -EIO is special case, it may indicate the worst
359		 * thing (e.g. bug) happened, so we avoid waiting for it.
360		 */
361		if (err != -EIO) {
362			int err2 = filemap_fdatawait(mapping);
363			if (!err)
364				err = err2;
365		}
366	}
367	return err;
368}
369EXPORT_SYMBOL(filemap_write_and_wait);
370
371/*
372 * Write out and wait upon file offsets lstart->lend, inclusive.
373 *
374 * Note that `lend' is inclusive (describes the last byte to be written) so
375 * that this function can be used to write to the very end-of-file (end = -1).
376 */
377int filemap_write_and_wait_range(struct address_space *mapping,
378				 loff_t lstart, loff_t lend)
379{
380	int err = 0;
381
382	if (mapping->nrpages) {
383		err = __filemap_fdatawrite_range(mapping, lstart, lend,
384						 WB_SYNC_ALL);
385		/* See comment of filemap_write_and_wait() */
386		if (err != -EIO) {
387			int err2 = wait_on_page_writeback_range(mapping,
388						lstart >> PAGE_CACHE_SHIFT,
389						lend >> PAGE_CACHE_SHIFT);
390			if (!err)
391				err = err2;
392		}
393	}
394	return err;
395}
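
/*
 * Illustrative sketch, not part of the original file: since `lend' names the
 * last byte to be written, passing -1 flushes and waits on the whole file,
 * e.g.
 *
 *	err = filemap_write_and_wait_range(inode->i_mapping, 0, -1);
 *
 * while a single page at byte offset "pos" would be
 *
 *	err = filemap_write_and_wait_range(inode->i_mapping, pos,
 *					   pos + PAGE_CACHE_SIZE - 1);
 *
 * "inode", "pos" and "err" are assumed to exist in the caller.
 */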
396
397/*
398 * This function is used to add newly allocated pagecache pages:
399 * the page is new, so we can just run SetPageLocked() against it.
400 * The other page state flags were set by rmqueue().
401 *
402 * This function does not add the page to the LRU.  The caller must do that.
403 */
404int add_to_page_cache(struct page *page, struct address_space *mapping,
405		pgoff_t offset, gfp_t gfp_mask)
406{
407	int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
408
409	if (error == 0) {
410		write_lock_irq(&mapping->tree_lock);
411		error = radix_tree_insert(&mapping->page_tree, offset, page);
412		if (!error) {
413			page_cache_get(page);
414			SetPageLocked(page);
415			page->mapping = mapping;
416			page->index = offset;
417			mapping->nrpages++;
418			pagecache_acct(1);
419		}
420		write_unlock_irq(&mapping->tree_lock);
421		radix_tree_preload_end();
422	}
423	return error;
424}
425
426EXPORT_SYMBOL(add_to_page_cache);
427
428int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
429				pgoff_t offset, gfp_t gfp_mask)
430{
431	int ret = add_to_page_cache(page, mapping, offset, gfp_mask);
432	if (ret == 0)
433		lru_cache_add(page);
434	return ret;
435}
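
/*
 * Illustrative sketch, not part of the original file: the usual way a new
 * page enters the page cache, mirroring page_cache_read() later in this
 * file.  Only functions defined or used in this file appear; "file",
 * "index" and "error" are assumed caller-provided.
 *
 *	page = page_cache_alloc_cold(mapping);
 *	if (!page)
 *		return -ENOMEM;
 *	error = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL);
 *	if (error == 0)
 *		error = mapping->a_ops->readpage(file, page);
 *	else if (error == -EEXIST)
 *		error = 0;	(somebody else added the page first)
 *	page_cache_release(page);
 */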
436
437#ifdef CONFIG_NUMA
438struct page *page_cache_alloc(struct address_space *x)
439{
440	if (cpuset_do_page_mem_spread()) {
441		int n = cpuset_mem_spread_node();
442		return alloc_pages_node(n, mapping_gfp_mask(x), 0);
443	}
444	return alloc_pages(mapping_gfp_mask(x), 0);
445}
446EXPORT_SYMBOL(page_cache_alloc);
447
448struct page *page_cache_alloc_cold(struct address_space *x)
449{
450	if (cpuset_do_page_mem_spread()) {
451		int n = cpuset_mem_spread_node();
452		return alloc_pages_node(n, mapping_gfp_mask(x)|__GFP_COLD, 0);
453	}
454	return alloc_pages(mapping_gfp_mask(x)|__GFP_COLD, 0);
455}
456EXPORT_SYMBOL(page_cache_alloc_cold);
457#endif
458
459/*
460 * In order to wait for pages to become available there must be
461 * waitqueues associated with pages.  Rather than giving each page its
462 * own waitqueue, we use a hash table of waitqueues: all waiters for
463 * pages hashing to the same bucket share one queue and are all woken
464 * when any of those pages becomes available.  Each woken context then
465 * re-checks whether the page it actually cares about became available.
466 * This saves space at the cost of "thundering herd" wakeups during
467 * rare hash collisions.
468 */
469static wait_queue_head_t *page_waitqueue(struct page *page)
470{
471	const struct zone *zone = page_zone(page);
472
473	return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)];
474}
475
476static inline void wake_up_page(struct page *page, int bit)
477{
478	__wake_up_bit(page_waitqueue(page), &page->flags, bit);
479}
480
481void fastcall wait_on_page_bit(struct page *page, int bit_nr)
482{
483	DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
484
485	if (test_bit(bit_nr, &page->flags))
486		__wait_on_bit(page_waitqueue(page), &wait, sync_page,
487							TASK_UNINTERRUPTIBLE);
488}
489EXPORT_SYMBOL(wait_on_page_bit);
490
491/**
492 * unlock_page() - unlock a locked page
493 *
494 * @page: the page
495 *
496 * Unlocks the page and wakes up sleepers in wait_on_page_locked().
497 * Also wakes sleepers in wait_on_page_writeback() because the wakeup
498 * mechanism between PageLocked pages and PageWriteback pages is shared.
499 * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
500 *
501 * The first mb is necessary to safely close the critical section opened by the
502 * TestSetPageLocked(), the second mb is necessary to enforce ordering between
503 * the clear_bit and the read of the waitqueue (to avoid SMP races with a
504 * parallel wait_on_page_locked()).
505 */
506void fastcall unlock_page(struct page *page)
507{
508	smp_mb__before_clear_bit();
509	if (!TestClearPageLocked(page))
510		BUG();
511	smp_mb__after_clear_bit();
512	wake_up_page(page, PG_locked);
513}
514EXPORT_SYMBOL(unlock_page);
515
516/*
517 * End writeback against a page.
518 */
519void end_page_writeback(struct page *page)
520{
521	if (!TestClearPageReclaim(page) || rotate_reclaimable_page(page)) {
522		if (!test_clear_page_writeback(page))
523			BUG();
524	}
525	smp_mb__after_clear_bit();
526	wake_up_page(page, PG_writeback);
527}
528EXPORT_SYMBOL(end_page_writeback);
529
530/*
531 * Get a lock on the page, assuming we need to sleep to get it.
532 *
533 * Ugly: running sync_page() in state TASK_UNINTERRUPTIBLE is scary.  If some
534 * random driver's requestfn sets TASK_RUNNING, we could busywait.  However
535 * chances are that on the second loop, the block layer's plug list is empty,
536 * so sync_page() will then return in state TASK_UNINTERRUPTIBLE.
537 */
538void fastcall __lock_page(struct page *page)
539{
540	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
541
542	__wait_on_bit_lock(page_waitqueue(page), &wait, sync_page,
543							TASK_UNINTERRUPTIBLE);
544}
545EXPORT_SYMBOL(__lock_page);
546
547/*
548 * a rather lightweight function, finding and getting a reference to a
549 * hashed page atomically.
550 */
551struct page * find_get_page(struct address_space *mapping, unsigned long offset)
552{
553	struct page *page;
554
555	read_lock_irq(&mapping->tree_lock);
556	page = radix_tree_lookup(&mapping->page_tree, offset);
557	if (page)
558		page_cache_get(page);
559	read_unlock_irq(&mapping->tree_lock);
560	return page;
561}
562
563EXPORT_SYMBOL(find_get_page);
564
565/*
566 * Same as above, but trylock it instead of incrementing the count.
567 */
568struct page *find_trylock_page(struct address_space *mapping, unsigned long offset)
569{
570	struct page *page;
571
572	read_lock_irq(&mapping->tree_lock);
573	page = radix_tree_lookup(&mapping->page_tree, offset);
574	if (page && TestSetPageLocked(page))
575		page = NULL;
576	read_unlock_irq(&mapping->tree_lock);
577	return page;
578}
579
580EXPORT_SYMBOL(find_trylock_page);
581
582/**
583 * find_lock_page - locate, pin and lock a pagecache page
584 *
585 * @mapping: the address_space to search
586 * @offset: the page index
587 *
588 * Locates the desired pagecache page, locks it, increments its reference
589 * count and returns its address.
590 *
591 * Returns NULL if the page was not present. find_lock_page() may sleep.
592 */
593struct page *find_lock_page(struct address_space *mapping,
594				unsigned long offset)
595{
596	struct page *page;
597
598	read_lock_irq(&mapping->tree_lock);
599repeat:
600	page = radix_tree_lookup(&mapping->page_tree, offset);
601	if (page) {
602		page_cache_get(page);
603		if (TestSetPageLocked(page)) {
604			read_unlock_irq(&mapping->tree_lock);
605			__lock_page(page);
606			read_lock_irq(&mapping->tree_lock);
607
608			/* Has the page been truncated while we slept? */
609			if (unlikely(page->mapping != mapping ||
610				     page->index != offset)) {
611				unlock_page(page);
612				page_cache_release(page);
613				goto repeat;
614			}
615		}
616	}
617	read_unlock_irq(&mapping->tree_lock);
618	return page;
619}
620
621EXPORT_SYMBOL(find_lock_page);
622
623/**
624 * find_or_create_page - locate or add a pagecache page
625 *
626 * @mapping: the page's address_space
627 * @index: the page's index into the mapping
628 * @gfp_mask: page allocation mode
629 *
630 * Locates a page in the pagecache.  If the page is not present, a new page
631 * is allocated using @gfp_mask and is added to the pagecache and to the VM's
632 * LRU list.  The returned page is locked and has its reference count
633 * incremented.
634 *
635 * find_or_create_page() may sleep, even if @gfp_mask specifies an atomic
636 * allocation!
637 *
638 * find_or_create_page() returns the desired page's address, or NULL on
639 * memory exhaustion.
640 */
641struct page *find_or_create_page(struct address_space *mapping,
642		unsigned long index, gfp_t gfp_mask)
643{
644	struct page *page, *cached_page = NULL;
645	int err;
646repeat:
647	page = find_lock_page(mapping, index);
648	if (!page) {
649		if (!cached_page) {
650			cached_page = alloc_page(gfp_mask);
651			if (!cached_page)
652				return NULL;
653		}
654		err = add_to_page_cache_lru(cached_page, mapping,
655					index, gfp_mask);
656		if (!err) {
657			page = cached_page;
658			cached_page = NULL;
659		} else if (err == -EEXIST)
660			goto repeat;
661	}
662	if (cached_page)
663		page_cache_release(cached_page);
664	return page;
665}
666
667EXPORT_SYMBOL(find_or_create_page);
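
/*
 * Illustrative sketch, not part of the original file: typical use of
 * find_or_create_page() to obtain a locked, pinned page and fill it.  The
 * memset() stands in for whatever the caller would really put in the page;
 * "mapping" and "index" are assumed caller-provided.
 *
 *	page = find_or_create_page(mapping, index, GFP_KERNEL);
 *	if (!page)
 *		return -ENOMEM;
 *	if (!PageUptodate(page)) {
 *		void *kaddr = kmap_atomic(page, KM_USER0);
 *		memset(kaddr, 0, PAGE_CACHE_SIZE);
 *		kunmap_atomic(kaddr, KM_USER0);
 *		SetPageUptodate(page);
 *	}
 *	unlock_page(page);
 *	page_cache_release(page);
 */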
668
669/**
670 * find_get_pages - gang pagecache lookup
671 * @mapping:	The address_space to search
672 * @start:	The starting page index
673 * @nr_pages:	The maximum number of pages
674 * @pages:	Where the resulting pages are placed
675 *
676 * find_get_pages() will search for and return a group of up to
677 * @nr_pages pages in the mapping.  The pages are placed at @pages.
678 * find_get_pages() takes a reference against the returned pages.
679 *
680 * The search returns a group of mapping-contiguous pages with ascending
681 * indexes.  There may be holes in the indices due to not-present pages.
682 *
683 * find_get_pages() returns the number of pages which were found.
684 */
685unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
686			    unsigned int nr_pages, struct page **pages)
687{
688	unsigned int i;
689	unsigned int ret;
690
691	read_lock_irq(&mapping->tree_lock);
692	ret = radix_tree_gang_lookup(&mapping->page_tree,
693				(void **)pages, start, nr_pages);
694	for (i = 0; i < ret; i++)
695		page_cache_get(pages[i]);
696	read_unlock_irq(&mapping->tree_lock);
697	return ret;
698}
699
700/*
701 * Like find_get_pages, except we only return pages which are tagged with
702 * `tag'.   We update *index to index the next page for the traversal.
703 */
704unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
705			int tag, unsigned int nr_pages, struct page **pages)
706{
707	unsigned int i;
708	unsigned int ret;
709
710	read_lock_irq(&mapping->tree_lock);
711	ret = radix_tree_gang_lookup_tag(&mapping->page_tree,
712				(void **)pages, *index, nr_pages, tag);
713	for (i = 0; i < ret; i++)
714		page_cache_get(pages[i]);
715	if (ret)
716		*index = pages[ret - 1]->index + 1;
717	read_unlock_irq(&mapping->tree_lock);
718	return ret;
719}
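
/*
 * Illustrative sketch, not part of the original file: walking every page in
 * a mapping that carries a given tag, which is what pagevec_lookup_tag()
 * and wait_on_page_writeback_range() above build upon.  The batch size of
 * 16 is arbitrary.
 *
 *	pgoff_t index = 0;
 *	struct page *pages[16];
 *	unsigned int nr, i;
 *
 *	while ((nr = find_get_pages_tag(mapping, &index,
 *			PAGECACHE_TAG_DIRTY, 16, pages)) != 0) {
 *		for (i = 0; i < nr; i++) {
 *			... look at pages[i] ...
 *			page_cache_release(pages[i]);
 *		}
 *	}
 */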
720
721/*
722 * Same as grab_cache_page, but do not wait if the page is unavailable.
723 * This is intended for speculative data generators, where the data can
724 * be regenerated if the page couldn't be grabbed.  This routine should
725 * be safe to call while holding the lock for another page.
726 *
727 * Clear __GFP_FS when allocating the page to avoid recursion into the fs
728 * and deadlock against the caller's locked page.
729 */
730struct page *
731grab_cache_page_nowait(struct address_space *mapping, unsigned long index)
732{
733	struct page *page = find_get_page(mapping, index);
734	gfp_t gfp_mask;
735
736	if (page) {
737		if (!TestSetPageLocked(page))
738			return page;
739		page_cache_release(page);
740		return NULL;
741	}
742	gfp_mask = mapping_gfp_mask(mapping) & ~__GFP_FS;
743	page = alloc_pages(gfp_mask, 0);
744	if (page && add_to_page_cache_lru(page, mapping, index, gfp_mask)) {
745		page_cache_release(page);
746		page = NULL;
747	}
748	return page;
749}
750
751EXPORT_SYMBOL(grab_cache_page_nowait);
752
753/*
754 * This is a generic file read routine, and uses the
755 * mapping->a_ops->readpage() function for the actual low-level
756 * stuff.
757 *
758 * This is really ugly. But the goto's actually try to clarify some
759 * of the logic when it comes to error handling etc.
760 *
761 * Note the struct file* is only passed for the use of readpage.  It may be
762 * NULL.
763 */
764void do_generic_mapping_read(struct address_space *mapping,
765			     struct file_ra_state *_ra,
766			     struct file *filp,
767			     loff_t *ppos,
768			     read_descriptor_t *desc,
769			     read_actor_t actor)
770{
771	struct inode *inode = mapping->host;
772	unsigned long index;
773	unsigned long end_index;
774	unsigned long offset;
775	unsigned long last_index;
776	unsigned long next_index;
777	unsigned long prev_index;
778	loff_t isize;
779	struct page *cached_page;
780	int error;
781	struct file_ra_state ra = *_ra;
782
783	cached_page = NULL;
784	index = *ppos >> PAGE_CACHE_SHIFT;
785	next_index = index;
786	prev_index = ra.prev_page;
787	last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
788	offset = *ppos & ~PAGE_CACHE_MASK;
789
790	isize = i_size_read(inode);
791	if (!isize)
792		goto out;
793
794	end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
795	for (;;) {
796		struct page *page;
797		unsigned long nr, ret;
798
799		/* nr is the maximum number of bytes to copy from this page */
800		nr = PAGE_CACHE_SIZE;
801		if (index >= end_index) {
802			if (index > end_index)
803				goto out;
804			nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
805			if (nr <= offset) {
806				goto out;
807			}
808		}
809		nr = nr - offset;
810
811		cond_resched();
812		if (index == next_index)
813			next_index = page_cache_readahead(mapping, &ra, filp,
814					index, last_index - index);
815
816find_page:
817		page = find_get_page(mapping, index);
818		if (unlikely(page == NULL)) {
819			handle_ra_miss(mapping, &ra, index);
820			goto no_cached_page;
821		}
822		if (!PageUptodate(page))
823			goto page_not_up_to_date;
824page_ok:
825
826		/* If users can be writing to this page using arbitrary
827		 * virtual addresses, take care about potential aliasing
828		 * before reading the page on the kernel side.
829		 */
830		if (mapping_writably_mapped(mapping))
831			flush_dcache_page(page);
832
833		/*
834		 * When (part of) the same page is read multiple times
835		 * in succession, only mark it as accessed the first time.
836		 */
837		if (prev_index != index)
838			mark_page_accessed(page);
839		prev_index = index;
840
841		/*
842		 * Ok, we have the page, and it's up-to-date, so
843		 * now we can copy it to user space...
844		 *
845		 * The actor routine returns how many bytes were actually used..
846		 * NOTE! This may not be the same as how much of a user buffer
847		 * we filled up (we may be padding etc), so we can only update
848		 * "pos" here (the actor routine has to update the user buffer
849		 * pointers and the remaining count).
850		 */
851		ret = actor(desc, page, offset, nr);
852		offset += ret;
853		index += offset >> PAGE_CACHE_SHIFT;
854		offset &= ~PAGE_CACHE_MASK;
855
856		page_cache_release(page);
857		if (ret == nr && desc->count)
858			continue;
859		goto out;
860
861page_not_up_to_date:
862		/* Get exclusive access to the page ... */
863		lock_page(page);
864
865		/* Did it get removed from the page cache before we got the lock? */
866		if (!page->mapping) {
867			unlock_page(page);
868			page_cache_release(page);
869			continue;
870		}
871
872		/* Did somebody else fill it already? */
873		if (PageUptodate(page)) {
874			unlock_page(page);
875			goto page_ok;
876		}
877
878readpage:
879		/* Start the actual read. The read will unlock the page. */
880		error = mapping->a_ops->readpage(filp, page);
881
882		if (unlikely(error)) {
883			if (error == AOP_TRUNCATED_PAGE) {
884				page_cache_release(page);
885				goto find_page;
886			}
887			goto readpage_error;
888		}
889
890		if (!PageUptodate(page)) {
891			lock_page(page);
892			if (!PageUptodate(page)) {
893				if (page->mapping == NULL) {
894					/*
895					 * invalidate_inode_pages got it
896					 */
897					unlock_page(page);
898					page_cache_release(page);
899					goto find_page;
900				}
901				unlock_page(page);
902				error = -EIO;
903				goto readpage_error;
904			}
905			unlock_page(page);
906		}
907
908		/*
909		 * i_size must be checked after we have done ->readpage.
910		 *
911		 * Checking i_size after the readpage allows us to calculate
912		 * the correct value for "nr", which means the zero-filled
913		 * part of the page is not copied back to userspace (unless
914		 * another truncate extends the file - this is desired though).
915		 */
916		isize = i_size_read(inode);
917		end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
918		if (unlikely(!isize || index > end_index)) {
919			page_cache_release(page);
920			goto out;
921		}
922
923		/* nr is the maximum number of bytes to copy from this page */
924		nr = PAGE_CACHE_SIZE;
925		if (index == end_index) {
926			nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
927			if (nr <= offset) {
928				page_cache_release(page);
929				goto out;
930			}
931		}
932		nr = nr - offset;
933		goto page_ok;
934
935readpage_error:
936		/* UHHUH! A synchronous read error occurred. Report it */
937		desc->error = error;
938		page_cache_release(page);
939		goto out;
940
941no_cached_page:
942		/*
943		 * Ok, it wasn't cached, so we need to create a new
944		 * page..
945		 */
946		if (!cached_page) {
947			cached_page = page_cache_alloc_cold(mapping);
948			if (!cached_page) {
949				desc->error = -ENOMEM;
950				goto out;
951			}
952		}
953		error = add_to_page_cache_lru(cached_page, mapping,
954						index, GFP_KERNEL);
955		if (error) {
956			if (error == -EEXIST)
957				goto find_page;
958			desc->error = error;
959			goto out;
960		}
961		page = cached_page;
962		cached_page = NULL;
963		goto readpage;
964	}
965
966out:
967	*_ra = ra;
968
969	*ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
970	if (cached_page)
971		page_cache_release(cached_page);
972	if (filp)
973		file_accessed(filp);
974}
975
976EXPORT_SYMBOL(do_generic_mapping_read);
977
978int file_read_actor(read_descriptor_t *desc, struct page *page,
979			unsigned long offset, unsigned long size)
980{
981	char *kaddr;
982	unsigned long left, count = desc->count;
983
984	if (size > count)
985		size = count;
986
987	/*
988	 * Faults on the destination of a read are common, so do it before
989	 * taking the kmap.
990	 */
991	if (!fault_in_pages_writeable(desc->arg.buf, size)) {
992		kaddr = kmap_atomic(page, KM_USER0);
993		left = __copy_to_user_inatomic(desc->arg.buf,
994						kaddr + offset, size);
995		kunmap_atomic(kaddr, KM_USER0);
996		if (left == 0)
997			goto success;
998	}
999
1000	/* Do it the slow way */
1001	kaddr = kmap(page);
1002	left = __copy_to_user(desc->arg.buf, kaddr + offset, size);
1003	kunmap(page);
1004
1005	if (left) {
1006		size -= left;
1007		desc->error = -EFAULT;
1008	}
1009success:
1010	desc->count = count - size;
1011	desc->written += size;
1012	desc->arg.buf += size;
1013	return size;
1014}
1015
1016/*
1017 * This is the "read()" routine for all filesystems
1018 * that can use the page cache directly.
1019 */
1020ssize_t
1021__generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1022		unsigned long nr_segs, loff_t *ppos)
1023{
1024	struct file *filp = iocb->ki_filp;
1025	ssize_t retval;
1026	unsigned long seg;
1027	size_t count;
1028
1029	count = 0;
1030	for (seg = 0; seg < nr_segs; seg++) {
1031		const struct iovec *iv = &iov[seg];
1032
1033		/*
1034		 * If any segment has a negative length, or the cumulative
1035		 * length ever wraps negative then return -EINVAL.
1036		 */
1037		count += iv->iov_len;
1038		if (unlikely((ssize_t)(count|iv->iov_len) < 0))
1039			return -EINVAL;
1040		if (access_ok(VERIFY_WRITE, iv->iov_base, iv->iov_len))
1041			continue;
1042		if (seg == 0)
1043			return -EFAULT;
1044		nr_segs = seg;
1045		count -= iv->iov_len;	/* This segment is no good */
1046		break;
1047	}
1048
1049	/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
1050	if (filp->f_flags & O_DIRECT) {
1051		loff_t pos = *ppos, size;
1052		struct address_space *mapping;
1053		struct inode *inode;
1054
1055		mapping = filp->f_mapping;
1056		inode = mapping->host;
1057		retval = 0;
1058		if (!count)
1059			goto out; /* skip atime */
1060		size = i_size_read(inode);
1061		if (pos < size) {
1062			retval = generic_file_direct_IO(READ, iocb,
1063						iov, pos, nr_segs);
1064			if (retval > 0 && !is_sync_kiocb(iocb))
1065				retval = -EIOCBQUEUED;
1066			if (retval > 0)
1067				*ppos = pos + retval;
1068		}
1069		file_accessed(filp);
1070		goto out;
1071	}
1072
1073	retval = 0;
1074	if (count) {
1075		for (seg = 0; seg < nr_segs; seg++) {
1076			read_descriptor_t desc;
1077
1078			desc.written = 0;
1079			desc.arg.buf = iov[seg].iov_base;
1080			desc.count = iov[seg].iov_len;
1081			if (desc.count == 0)
1082				continue;
1083			desc.error = 0;
1084			do_generic_file_read(filp,ppos,&desc,file_read_actor);
1085			retval += desc.written;
1086			if (desc.error) {
1087				retval = retval ?: desc.error;
1088				break;
1089			}
1090		}
1091	}
1092out:
1093	return retval;
1094}
1095
1096EXPORT_SYMBOL(__generic_file_aio_read);
1097
1098ssize_t
1099generic_file_aio_read(struct kiocb *iocb, char __user *buf, size_t count, loff_t pos)
1100{
1101	struct iovec local_iov = { .iov_base = buf, .iov_len = count };
1102
1103	BUG_ON(iocb->ki_pos != pos);
1104	return __generic_file_aio_read(iocb, &local_iov, 1, &iocb->ki_pos);
1105}
1106
1107EXPORT_SYMBOL(generic_file_aio_read);
1108
1109ssize_t
1110generic_file_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos)
1111{
1112	struct iovec local_iov = { .iov_base = buf, .iov_len = count };
1113	struct kiocb kiocb;
1114	ssize_t ret;
1115
1116	init_sync_kiocb(&kiocb, filp);
1117	ret = __generic_file_aio_read(&kiocb, &local_iov, 1, ppos);
1118	if (-EIOCBQUEUED == ret)
1119		ret = wait_on_sync_kiocb(&kiocb);
1120	return ret;
1121}
1122
1123EXPORT_SYMBOL(generic_file_read);
1124
1125int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
1126{
1127	ssize_t written;
1128	unsigned long count = desc->count;
1129	struct file *file = desc->arg.data;
1130
1131	if (size > count)
1132		size = count;
1133
1134	written = file->f_op->sendpage(file, page, offset,
1135				       size, &file->f_pos, size<count);
1136	if (written < 0) {
1137		desc->error = written;
1138		written = 0;
1139	}
1140	desc->count = count - written;
1141	desc->written += written;
1142	return written;
1143}
1144
1145ssize_t generic_file_sendfile(struct file *in_file, loff_t *ppos,
1146			 size_t count, read_actor_t actor, void *target)
1147{
1148	read_descriptor_t desc;
1149
1150	if (!count)
1151		return 0;
1152
1153	desc.written = 0;
1154	desc.count = count;
1155	desc.arg.data = target;
1156	desc.error = 0;
1157
1158	do_generic_file_read(in_file, ppos, &desc, actor);
1159	if (desc.written)
1160		return desc.written;
1161	return desc.error;
1162}
1163
1164EXPORT_SYMBOL(generic_file_sendfile);
1165
1166static ssize_t
1167do_readahead(struct address_space *mapping, struct file *filp,
1168	     unsigned long index, unsigned long nr)
1169{
1170	if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage)
1171		return -EINVAL;
1172
1173	force_page_cache_readahead(mapping, filp, index,
1174					max_sane_readahead(nr));
1175	return 0;
1176}
1177
1178asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count)
1179{
1180	ssize_t ret;
1181	struct file *file;
1182
1183	ret = -EBADF;
1184	file = fget(fd);
1185	if (file) {
1186		if (file->f_mode & FMODE_READ) {
1187			struct address_space *mapping = file->f_mapping;
1188			unsigned long start = offset >> PAGE_CACHE_SHIFT;
1189			unsigned long end = (offset + count - 1) >> PAGE_CACHE_SHIFT;
1190			unsigned long len = end - start + 1;
1191			ret = do_readahead(mapping, file, start, len);
1192		}
1193		fput(file);
1194	}
1195	return ret;
1196}
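
/*
 * Illustrative sketch, not part of the original file: from userspace this
 * syscall is reached through the glibc readahead() wrapper (declared in
 * <fcntl.h> with _GNU_SOURCE); the file name and length below are arbitrary.
 *
 *	int fd = open("/some/file", O_RDONLY);
 *	readahead(fd, 0, 1024 * 1024);
 *
 * This only populates the page cache for the given byte range so that later
 * read()s are more likely to hit cached data.
 */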
1197
1198#ifdef CONFIG_MMU
1199/*
1200 * This adds the requested page to the page cache if it isn't already there,
1201 * and schedules an I/O to read in its contents from disk.
1202 */
1203static int FASTCALL(page_cache_read(struct file * file, unsigned long offset));
1204static int fastcall page_cache_read(struct file * file, unsigned long offset)
1205{
1206	struct address_space *mapping = file->f_mapping;
1207	struct page *page;
1208	int ret;
1209
1210	do {
1211		page = page_cache_alloc_cold(mapping);
1212		if (!page)
1213			return -ENOMEM;
1214
1215		ret = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL);
1216		if (ret == 0)
1217			ret = mapping->a_ops->readpage(file, page);
1218		else if (ret == -EEXIST)
1219			ret = 0; /* losing race to add is OK */
1220
1221		page_cache_release(page);
1222
1223	} while (ret == AOP_TRUNCATED_PAGE);
1224
1225	return ret;
1226}
1227
1228#define MMAP_LOTSAMISS  (100)
1229
1230/*
1231 * filemap_nopage() is invoked via the vma operations vector for a
1232 * mapped memory region to read in file data during a page fault.
1233 *
1234 * The goto's are kind of ugly, but this streamlines the normal case of having
1235 * it in the page cache, and handles the special cases reasonably without
1236 * having a lot of duplicated code.
1237 */
1238struct page *filemap_nopage(struct vm_area_struct *area,
1239				unsigned long address, int *type)
1240{
1241	int error;
1242	struct file *file = area->vm_file;
1243	struct address_space *mapping = file->f_mapping;
1244	struct file_ra_state *ra = &file->f_ra;
1245	struct inode *inode = mapping->host;
1246	struct page *page;
1247	unsigned long size, pgoff;
1248	int did_readaround = 0, majmin = VM_FAULT_MINOR;
1249
1250	pgoff = ((address-area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
1251
1252retry_all:
1253	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1254	if (pgoff >= size)
1255		goto outside_data_content;
1256
1257	/* If we don't want any read-ahead, don't bother */
1258	if (VM_RandomReadHint(area))
1259		goto no_cached_page;
1260
1261	/*
1262	 * The readahead code wants to be told about each and every page
1263	 * so it can build and shrink its windows appropriately
1264	 *
1265	 * For sequential accesses, we use the generic readahead logic.
1266	 */
1267	if (VM_SequentialReadHint(area))
1268		page_cache_readahead(mapping, ra, file, pgoff, 1);
1269
1270	/*
1271	 * Do we have something in the page cache already?
1272	 */
1273retry_find:
1274	page = find_get_page(mapping, pgoff);
1275	if (!page) {
1276		unsigned long ra_pages;
1277
1278		if (VM_SequentialReadHint(area)) {
1279			handle_ra_miss(mapping, ra, pgoff);
1280			goto no_cached_page;
1281		}
1282		ra->mmap_miss++;
1283
1284		/*
1285		 * Do we miss much more than hit in this file? If so,
1286		 * stop bothering with read-ahead. It will only hurt.
1287		 */
1288		if (ra->mmap_miss > ra->mmap_hit + MMAP_LOTSAMISS)
1289			goto no_cached_page;
1290
1291		/*
1292		 * To keep the pgmajfault counter straight, we need to
1293		 * check did_readaround, as this is an inner loop.
1294		 */
1295		if (!did_readaround) {
1296			majmin = VM_FAULT_MAJOR;
1297			inc_page_state(pgmajfault);
1298		}
1299		did_readaround = 1;
1300		ra_pages = max_sane_readahead(file->f_ra.ra_pages);
1301		if (ra_pages) {
1302			pgoff_t start = 0;
1303
1304			if (pgoff > ra_pages / 2)
1305				start = pgoff - ra_pages / 2;
1306			do_page_cache_readahead(mapping, file, start, ra_pages);
1307		}
1308		page = find_get_page(mapping, pgoff);
1309		if (!page)
1310			goto no_cached_page;
1311	}
1312
1313	if (!did_readaround)
1314		ra->mmap_hit++;
1315
1316	/*
1317	 * Ok, found a page in the page cache, now we need to check
1318	 * that it's up-to-date.
1319	 */
1320	if (!PageUptodate(page))
1321		goto page_not_uptodate;
1322
1323success:
1324	/*
1325	 * Found the page and have a reference on it.
1326	 */
1327	mark_page_accessed(page);
1328	if (type)
1329		*type = majmin;
1330	return page;
1331
1332outside_data_content:
1333	/*
1334	 * An external ptracer can access pages that normally aren't
1335	 * accessible..
1336	 */
1337	if (area->vm_mm == current->mm)
1338		return NULL;
1339	/* Fall through to the non-read-ahead case */
1340no_cached_page:
1341	/*
1342	 * We're only likely to ever get here if MADV_RANDOM is in
1343	 * effect.
1344	 */
1345	error = page_cache_read(file, pgoff);
1346	grab_swap_token();
1347
1348	/*
1349	 * The page we want has now been added to the page cache.
1350	 * In the unlikely event that someone removed it in the
1351	 * meantime, we'll just come back here and read it again.
1352	 */
1353	if (error >= 0)
1354		goto retry_find;
1355
1356	/*
1357	 * An error return from page_cache_read can result if the
1358	 * system is low on memory, or a problem occurs while trying
1359	 * to schedule I/O.
1360	 */
1361	if (error == -ENOMEM)
1362		return NOPAGE_OOM;
1363	return NULL;
1364
1365page_not_uptodate:
1366	if (!did_readaround) {
1367		majmin = VM_FAULT_MAJOR;
1368		inc_page_state(pgmajfault);
1369	}
1370	lock_page(page);
1371
1372	/* Did it get removed from the page cache while we waited for it? */
1373	if (!page->mapping) {
1374		unlock_page(page);
1375		page_cache_release(page);
1376		goto retry_all;
1377	}
1378
1379	/* Did somebody else get it up-to-date? */
1380	if (PageUptodate(page)) {
1381		unlock_page(page);
1382		goto success;
1383	}
1384
1385	error = mapping->a_ops->readpage(file, page);
1386	if (!error) {
1387		wait_on_page_locked(page);
1388		if (PageUptodate(page))
1389			goto success;
1390	} else if (error == AOP_TRUNCATED_PAGE) {
1391		page_cache_release(page);
1392		goto retry_find;
1393	}
1394
1395	/*
1396	 * Umm, take care of errors if the page isn't up-to-date.
1397	 * Try to re-read it _once_. We do this synchronously,
1398	 * because there really aren't any performance issues here
1399	 * and we need to check for errors.
1400	 */
1401	lock_page(page);
1402
1403	/* Somebody truncated the page on us? */
1404	if (!page->mapping) {
1405		unlock_page(page);
1406		page_cache_release(page);
1407		goto retry_all;
1408	}
1409
1410	/* Somebody else successfully read it in? */
1411	if (PageUptodate(page)) {
1412		unlock_page(page);
1413		goto success;
1414	}
1415	ClearPageError(page);
1416	error = mapping->a_ops->readpage(file, page);
1417	if (!error) {
1418		wait_on_page_locked(page);
1419		if (PageUptodate(page))
1420			goto success;
1421	} else if (error == AOP_TRUNCATED_PAGE) {
1422		page_cache_release(page);
1423		goto retry_find;
1424	}
1425
1426	/*
1427	 * Things didn't work out. Return zero to tell the
1428	 * mm layer so, possibly freeing the page cache page first.
1429	 */
1430	page_cache_release(page);
1431	return NULL;
1432}
1433
1434EXPORT_SYMBOL(filemap_nopage);
1435
1436static struct page * filemap_getpage(struct file *file, unsigned long pgoff,
1437					int nonblock)
1438{
1439	struct address_space *mapping = file->f_mapping;
1440	struct page *page;
1441	int error;
1442
1443	/*
1444	 * Do we have something in the page cache already?
1445	 */
1446retry_find:
1447	page = find_get_page(mapping, pgoff);
1448	if (!page) {
1449		if (nonblock)
1450			return NULL;
1451		goto no_cached_page;
1452	}
1453
1454	/*
1455	 * Ok, found a page in the page cache, now we need to check
1456	 * that it's up-to-date.
1457	 */
1458	if (!PageUptodate(page)) {
1459		if (nonblock) {
1460			page_cache_release(page);
1461			return NULL;
1462		}
1463		goto page_not_uptodate;
1464	}
1465
1466success:
1467	/*
1468	 * Found the page and have a reference on it.
1469	 */
1470	mark_page_accessed(page);
1471	return page;
1472
1473no_cached_page:
1474	error = page_cache_read(file, pgoff);
1475
1476	/*
1477	 * The page we want has now been added to the page cache.
1478	 * In the unlikely event that someone removed it in the
1479	 * meantime, we'll just come back here and read it again.
1480	 */
1481	if (error >= 0)
1482		goto retry_find;
1483
1484	/*
1485	 * An error return from page_cache_read can result if the
1486	 * system is low on memory, or a problem occurs while trying
1487	 * to schedule I/O.
1488	 */
1489	return NULL;
1490
1491page_not_uptodate:
1492	lock_page(page);
1493
1494	/* Did it get removed from the page cache while we waited for it? */
1495	if (!page->mapping) {
1496		unlock_page(page);
1497		goto err;
1498	}
1499
1500	/* Did somebody else get it up-to-date? */
1501	if (PageUptodate(page)) {
1502		unlock_page(page);
1503		goto success;
1504	}
1505
1506	error = mapping->a_ops->readpage(file, page);
1507	if (!error) {
1508		wait_on_page_locked(page);
1509		if (PageUptodate(page))
1510			goto success;
1511	} else if (error == AOP_TRUNCATED_PAGE) {
1512		page_cache_release(page);
1513		goto retry_find;
1514	}
1515
1516	/*
1517	 * Umm, take care of errors if the page isn't up-to-date.
1518	 * Try to re-read it _once_. We do this synchronously,
1519	 * because there really aren't any performance issues here
1520	 * and we need to check for errors.
1521	 */
1522	lock_page(page);
1523
1524	/* Somebody truncated the page on us? */
1525	if (!page->mapping) {
1526		unlock_page(page);
1527		goto err;
1528	}
1529	/* Somebody else successfully read it in? */
1530	if (PageUptodate(page)) {
1531		unlock_page(page);
1532		goto success;
1533	}
1534
1535	ClearPageError(page);
1536	error = mapping->a_ops->readpage(file, page);
1537	if (!error) {
1538		wait_on_page_locked(page);
1539		if (PageUptodate(page))
1540			goto success;
1541	} else if (error == AOP_TRUNCATED_PAGE) {
1542		page_cache_release(page);
1543		goto retry_find;
1544	}
1545
1546	/*
1547	 * Things didn't work out. Return zero to tell the
1548	 * mm layer so, possibly freeing the page cache page first.
1549	 */
1550err:
1551	page_cache_release(page);
1552
1553	return NULL;
1554}
1555
1556int filemap_populate(struct vm_area_struct *vma, unsigned long addr,
1557		unsigned long len, pgprot_t prot, unsigned long pgoff,
1558		int nonblock)
1559{
1560	struct file *file = vma->vm_file;
1561	struct address_space *mapping = file->f_mapping;
1562	struct inode *inode = mapping->host;
1563	unsigned long size;
1564	struct mm_struct *mm = vma->vm_mm;
1565	struct page *page;
1566	int err;
1567
1568	if (!nonblock)
1569		force_page_cache_readahead(mapping, vma->vm_file,
1570					pgoff, len >> PAGE_CACHE_SHIFT);
1571
1572repeat:
1573	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1574	if (pgoff + (len >> PAGE_CACHE_SHIFT) > size)
1575		return -EINVAL;
1576
1577	page = filemap_getpage(file, pgoff, nonblock);
1578
1579	/* XXX: This is wrong, a filesystem I/O error may have happened. Fix that as
1580	 * done in shmem_populate calling shmem_getpage */
1581	if (!page && !nonblock)
1582		return -ENOMEM;
1583
1584	if (page) {
1585		err = install_page(mm, vma, addr, page, prot);
1586		if (err) {
1587			page_cache_release(page);
1588			return err;
1589		}
1590	} else if (vma->vm_flags & VM_NONLINEAR) {
1591		/* No page was found just because we can't read it in now (being
1592		 * here implies nonblock != 0), but the page may exist, so set
1593		 * the PTE to fault it in later. */
1594		err = install_file_pte(mm, vma, addr, pgoff, prot);
1595		if (err)
1596			return err;
1597	}
1598
1599	len -= PAGE_SIZE;
1600	addr += PAGE_SIZE;
1601	pgoff++;
1602	if (len)
1603		goto repeat;
1604
1605	return 0;
1606}
1607EXPORT_SYMBOL(filemap_populate);
1608
1609struct vm_operations_struct generic_file_vm_ops = {
1610	.nopage		= filemap_nopage,
1611	.populate	= filemap_populate,
1612};
1613
1614/* This is used for a general mmap of a disk file */
1615
1616int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
1617{
1618	struct address_space *mapping = file->f_mapping;
1619
1620	if (!mapping->a_ops->readpage)
1621		return -ENOEXEC;
1622	file_accessed(file);
1623	vma->vm_ops = &generic_file_vm_ops;
1624	return 0;
1625}
1626
1627/*
1628 * This is for filesystems which do not implement ->writepage.
1629 */
1630int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
1631{
1632	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
1633		return -EINVAL;
1634	return generic_file_mmap(file, vma);
1635}
1636#else
1637int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
1638{
1639	return -ENOSYS;
1640}
1641int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma)
1642{
1643	return -ENOSYS;
1644}
1645#endif /* CONFIG_MMU */
1646
1647EXPORT_SYMBOL(generic_file_mmap);
1648EXPORT_SYMBOL(generic_file_readonly_mmap);
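
/*
 * Illustrative sketch, not part of the original file: a filesystem that
 * relies on the generic page-cache mmap path simply points its
 * file_operations at the helpers exported above ("myfs" is a made-up name):
 *
 *	struct file_operations myfs_file_operations = {
 *		.read		= generic_file_read,
 *		.mmap		= generic_file_mmap,
 *		.sendfile	= generic_file_sendfile,
 *	};
 *
 * Page faults on such a mapping then arrive at filemap_nopage() via
 * generic_file_vm_ops.
 */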
1649
1650static inline struct page *__read_cache_page(struct address_space *mapping,
1651				unsigned long index,
1652				int (*filler)(void *,struct page*),
1653				void *data)
1654{
1655	struct page *page, *cached_page = NULL;
1656	int err;
1657repeat:
1658	page = find_get_page(mapping, index);
1659	if (!page) {
1660		if (!cached_page) {
1661			cached_page = page_cache_alloc_cold(mapping);
1662			if (!cached_page)
1663				return ERR_PTR(-ENOMEM);
1664		}
1665		err = add_to_page_cache_lru(cached_page, mapping,
1666					index, GFP_KERNEL);
1667		if (err == -EEXIST)
1668			goto repeat;
1669		if (err < 0) {
1670			/* Presumably ENOMEM for radix tree node */
1671			page_cache_release(cached_page);
1672			return ERR_PTR(err);
1673		}
1674		page = cached_page;
1675		cached_page = NULL;
1676		err = filler(data, page);
1677		if (err < 0) {
1678			page_cache_release(page);
1679			page = ERR_PTR(err);
1680		}
1681	}
1682	if (cached_page)
1683		page_cache_release(cached_page);
1684	return page;
1685}
1686
1687/*
1688 * Read into the page cache. If a page already exists,
1689 * and PageUptodate() is not set, try to fill the page.
1690 */
1691struct page *read_cache_page(struct address_space *mapping,
1692				unsigned long index,
1693				int (*filler)(void *,struct page*),
1694				void *data)
1695{
1696	struct page *page;
1697	int err;
1698
1699retry:
1700	page = __read_cache_page(mapping, index, filler, data);
1701	if (IS_ERR(page))
1702		goto out;
1703	mark_page_accessed(page);
1704	if (PageUptodate(page))
1705		goto out;
1706
1707	lock_page(page);
1708	if (!page->mapping) {
1709		unlock_page(page);
1710		page_cache_release(page);
1711		goto retry;
1712	}
1713	if (PageUptodate(page)) {
1714		unlock_page(page);
1715		goto out;
1716	}
1717	err = filler(data, page);
1718	if (err < 0) {
1719		page_cache_release(page);
1720		page = ERR_PTR(err);
1721	}
1722 out:
1723	return page;
1724}
1725
1726EXPORT_SYMBOL(read_cache_page);
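
/*
 * Illustrative sketch, not part of the original file: callers commonly use
 * the mapping's own ->readpage as the filler, e.g. to read one page of
 * metadata ("n" is a hypothetical page index; whether NULL is acceptable as
 * the data argument depends on that filesystem's readpage):
 *
 *	page = read_cache_page(mapping, n,
 *			(filler_t *)mapping->a_ops->readpage, NULL);
 *	if (IS_ERR(page))
 *		return PTR_ERR(page);
 *	wait_on_page_locked(page);
 *	if (!PageUptodate(page)) {
 *		page_cache_release(page);
 *		return -EIO;
 *	}
 */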
1727
1728/*
1729 * If the page was newly created, increment its refcount and add it to the
1730 * caller's lru-buffering pagevec.  This function is specifically for
1731 * generic_file_write().
1732 */
1733static inline struct page *
1734__grab_cache_page(struct address_space *mapping, unsigned long index,
1735			struct page **cached_page, struct pagevec *lru_pvec)
1736{
1737	int err;
1738	struct page *page;
1739repeat:
1740	page = find_lock_page(mapping, index);
1741	if (!page) {
1742		if (!*cached_page) {
1743			*cached_page = page_cache_alloc(mapping);
1744			if (!*cached_page)
1745				return NULL;
1746		}
1747		err = add_to_page_cache(*cached_page, mapping,
1748					index, GFP_KERNEL);
1749		if (err == -EEXIST)
1750			goto repeat;
1751		if (err == 0) {
1752			page = *cached_page;
1753			page_cache_get(page);
1754			if (!pagevec_add(lru_pvec, page))
1755				__pagevec_lru_add(lru_pvec);
1756			*cached_page = NULL;
1757		}
1758	}
1759	return page;
1760}
1761
1762/*
1763 * The logic we want is
1764 *
1765 *	if suid or (sgid and xgrp)
1766 *		remove privs
1767 */
1768int remove_suid(struct dentry *dentry)
1769{
1770	mode_t mode = dentry->d_inode->i_mode;
1771	int kill = 0;
1772	int result = 0;
1773
1774	/* suid always must be killed */
1775	if (unlikely(mode & S_ISUID))
1776		kill = ATTR_KILL_SUID;
1777
1778	/*
1779	 * sgid without any exec bits is just a mandatory locking mark; leave
1780	 * it alone.  If some exec bits are set, it's a real sgid; kill it.
1781	 */
1782	if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
1783		kill |= ATTR_KILL_SGID;
1784
1785	if (unlikely(kill && !capable(CAP_FSETID))) {
1786		struct iattr newattrs;
1787
1788		newattrs.ia_valid = ATTR_FORCE | kill;
1789		result = notify_change(dentry, &newattrs);
1790	}
1791	return result;
1792}
1793EXPORT_SYMBOL(remove_suid);
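
/*
 * Illustrative sketch, not part of the original file: write paths are
 * expected to drop the setuid/setgid bits before modifying file data,
 * roughly
 *
 *	err = remove_suid(file->f_dentry);
 *	if (err)
 *		goto out;
 *	... carry on with the write ...
 *
 * "err" and the "out" label are placeholders for the caller's own error
 * handling.
 */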
1794
1795size_t
1796__filemap_copy_from_user_iovec(char *vaddr,
1797			const struct iovec *iov, size_t base, size_t bytes)
1798{
1799	size_t copied = 0, left = 0;
1800
1801	while (bytes) {
1802		char __user *buf = iov->iov_base + base;
1803		int copy = min(bytes, iov->iov_len - base);
1804
1805		base = 0;
1806		left = __copy_from_user_inatomic(vaddr, buf, copy);
1807		copied += copy;
1808		bytes -= copy;
1809		vaddr += copy;
1810		iov++;
1811
1812		if (unlikely(left)) {
1813			/* zero the rest of the target like __copy_from_user */
1814			if (bytes)
1815				memset(vaddr, 0, bytes);
1816			break;
1817		}
1818	}
1819	return copied - left;
1820}
1821
1822/*
1823 * Performs necessary checks before doing a write
1824 *
1825 * Can adjust the writing position or the number of bytes to write.
1826 * Returns an appropriate error code that the caller should return, or
1827 * zero if the write should be allowed.
1828 */
1829inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk)
1830{
1831	struct inode *inode = file->f_mapping->host;
1832	unsigned long limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
1833
1834        if (unlikely(*pos < 0))
1835                return -EINVAL;
1836
1837	if (!isblk) {
1838		/* FIXME: this is for backwards compatibility with 2.4 */
1839		if (file->f_flags & O_APPEND)
1840                        *pos = i_size_read(inode);
1841
1842		if (limit != RLIM_INFINITY) {
1843			if (*pos >= limit) {
1844				send_sig(SIGXFSZ, current, 0);
1845				return -EFBIG;
1846			}
1847			if (*count > limit - (typeof(limit))*pos) {
1848				*count = limit - (typeof(limit))*pos;
1849			}
1850		}
1851	}
1852
1853	/*
1854	 * LFS rule
1855	 */
1856	if (unlikely(*pos + *count > MAX_NON_LFS &&
1857				!(file->f_flags & O_LARGEFILE))) {
1858		if (*pos >= MAX_NON_LFS) {
1859			send_sig(SIGXFSZ, current, 0);
1860			return -EFBIG;
1861		}
1862		if (*count > MAX_NON_LFS - (unsigned long)*pos) {
1863			*count = MAX_NON_LFS - (unsigned long)*pos;
1864		}
1865	}
1866
1867	/*
1868	 * Are we about to exceed the fs block limit ?
1869	 *
1870	 * If we have written data it becomes a short write.  If we have
1871	 * exceeded without writing data we send a signal and return EFBIG.
1872	 * Linus frestrict idea will clean these up nicely..
1873	 */
1874	if (likely(!isblk)) {
1875		if (unlikely(*pos >= inode->i_sb->s_maxbytes)) {
1876			if (*count || *pos > inode->i_sb->s_maxbytes) {
1877				send_sig(SIGXFSZ, current, 0);
1878				return -EFBIG;
1879			}
1880			/* zero-length writes at ->s_maxbytes are OK */
1881		}
1882
1883		if (unlikely(*pos + *count > inode->i_sb->s_maxbytes))
1884			*count = inode->i_sb->s_maxbytes - *pos;
1885	} else {
1886		loff_t isize;
1887		if (bdev_read_only(I_BDEV(inode)))
1888			return -EPERM;
1889		isize = i_size_read(inode);
1890		if (*pos >= isize) {
1891			if (*count || *pos > isize)
1892				return -ENOSPC;
1893		}
1894
1895		if (*pos + *count > isize)
1896			*count = isize - *pos;
1897	}
1898	return 0;
1899}
1900EXPORT_SYMBOL(generic_write_checks);
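
/*
 * Illustrative sketch, not part of the original file: a write helper would
 * clamp its position and byte count through generic_write_checks() before
 * touching the page cache.  "ppos" and "ocount" stand for the caller's file
 * position pointer and original byte count.
 *
 *	loff_t pos = *ppos;
 *	size_t count = ocount;
 *	int err;
 *
 *	err = generic_write_checks(file, &pos, &count,
 *				   S_ISBLK(inode->i_mode));
 *	if (err)
 *		return err;
 *	if (count == 0)
 *		return 0;
 *	... proceed to write "count" bytes at offset "pos" ...
 */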
1901
1902ssize_t
1903generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
1904		unsigned long *nr_segs, loff_t pos, loff_t *ppos,
1905		size_t count, size_t ocount)
1906{
1907	struct file	*file = iocb->ki_filp;
1908	struct address_space *mapping = file->f_mapping;
1909	struct inode	*inode = mapping->host;
1910	ssize_t		written;
1911
1912	if (count != ocount)
1913		*nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count);
1914
1915	written = generic_file_direct_IO(WRITE, iocb, iov, pos, *nr_segs);
1916	if (written > 0) {
1917		loff_t end = pos + written;
1918		if (end > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
1919			i_size_write(inode,  end);
1920			mark_inode_dirty(inode);
1921		}
1922		*ppos = end;
1923	}
1924
1925	/*
1926	 * Sync the fs metadata but not the minor inode changes and
1927	 * of course not the data as we did direct DMA for the IO.
1928	 * i_mutex is held, which protects generic_osync_inode() from
1929	 * livelocking.
1930	 */
1931	if (written >= 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
1932		int err = generic_osync_inode(inode, mapping, OSYNC_METADATA);
1933		if (err < 0)
1934			written = err;
1935	}
1936	if (written == count && !is_sync_kiocb(iocb))
1937		written = -EIOCBQUEUED;
1938	return written;
1939}
1940EXPORT_SYMBOL(generic_file_direct_write);
1941
1942ssize_t
1943generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
1944		unsigned long nr_segs, loff_t pos, loff_t *ppos,
1945		size_t count, ssize_t written)
1946{
1947	struct file *file = iocb->ki_filp;
1948	struct address_space * mapping = file->f_mapping;
1949	struct address_space_operations *a_ops = mapping->a_ops;
1950	struct inode 	*inode = mapping->host;
1951	long		status = 0;
1952	struct page	*page;
1953	struct page	*cached_page = NULL;
1954	size_t		bytes;
1955	struct pagevec	lru_pvec;
1956	const struct iovec *cur_iov = iov; /* current iovec */
1957	size_t		iov_base = 0;	   /* offset in the current iovec */
1958	char __user	*buf;
1959
1960	pagevec_init(&lru_pvec, 0);
1961
1962	/*
1963	 * handle partial DIO write.  Adjust cur_iov if needed.
1964	 */
1965	if (likely(nr_segs == 1))
1966		buf = iov->iov_base + written;
1967	else {
1968		filemap_set_next_iovec(&cur_iov, &iov_base, written);
1969		buf = cur_iov->iov_base + iov_base;
1970	}
1971
1972	do {
1973		unsigned long index;
1974		unsigned long offset;
1975		unsigned long maxlen;
1976		size_t copied;
1977
1978		offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
1979		index = pos >> PAGE_CACHE_SHIFT;
1980		bytes = PAGE_CACHE_SIZE - offset;
1981		if (bytes > count)
1982			bytes = count;
1983
1984		/*
1985		 * Bring in the user page that we will copy from _first_:
1986		 * if the source buffer is an mmap of this very page, the
1987		 * copy would otherwise fault on a page we hold locked and
1988		 * not yet up-to-date, deadlocking against ourselves.
1989		 */
1990		maxlen = cur_iov->iov_len - iov_base;
1991		if (maxlen > bytes)
1992			maxlen = bytes;
1993		fault_in_pages_readable(buf, maxlen);
1994
1995		page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec);
1996		if (!page) {
1997			status = -ENOMEM;
1998			break;
1999		}
2000
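		/*
		 * ->prepare_write() readies bytes [offset, offset+bytes)
		 * of this page for us.  It may return AOP_TRUNCATED_PAGE
		 * to say it unlocked the page because it was truncated or
		 * invalidated under us; in that case drop our reference
		 * and retry this chunk with a freshly looked-up page.
		 */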
2001		status = a_ops->prepare_write(file, page, offset, offset+bytes);
2002		if (unlikely(status)) {
2003			loff_t isize = i_size_read(inode);
2004
2005			if (status != AOP_TRUNCATED_PAGE)
2006				unlock_page(page);
2007			page_cache_release(page);
2008			if (status == AOP_TRUNCATED_PAGE)
2009				continue;
2010			/*
2011			 * prepare_write() may have instantiated a few blocks
2012			 * outside i_size.  Trim these off again.
2013			 */
2014			if (pos + bytes > isize)
2015				vmtruncate(inode, isize);
2016			break;
2017		}
2018		if (likely(nr_segs == 1))
2019			copied = filemap_copy_from_user(page, offset,
2020							buf, bytes);
2021		else
2022			copied = filemap_copy_from_user_iovec(page, offset,
2023						cur_iov, iov_base, bytes);
2024		flush_dcache_page(page);
2025		status = a_ops->commit_write(file, page, offset, offset+bytes);
2026		if (status == AOP_TRUNCATED_PAGE) {
2027			page_cache_release(page);
2028			continue;
2029		}
2030		if (likely(copied > 0)) {
2031			if (!status)
2032				status = copied;
2033
2034			if (status >= 0) {
2035				written += status;
2036				count -= status;
2037				pos += status;
2038				buf += status;
2039				if (unlikely(nr_segs > 1)) {
2040					filemap_set_next_iovec(&cur_iov,
2041							&iov_base, status);
2042					if (count)
2043						buf = cur_iov->iov_base +
2044							iov_base;
2045				} else {
2046					iov_base += status;
2047				}
2048			}
2049		}
2050		if (unlikely(copied != bytes))
2051			if (status >= 0)
2052				status = -EFAULT;
2053		unlock_page(page);
2054		mark_page_accessed(page);
2055		page_cache_release(page);
2056		if (status < 0)
2057			break;
2058		balance_dirty_pages_ratelimited(mapping);
2059		cond_resched();
2060	} while (count);
2061	*ppos = pos;
2062
2063	if (cached_page)
2064		page_cache_release(cached_page);
2065
2066	/*
2067	 * For now, when the user asks for O_SYNC, we'll actually give O_DSYNC
2068	 */
2069	if (likely(status >= 0)) {
2070		if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2071			if (!a_ops->writepage || !is_sync_kiocb(iocb))
2072				status = generic_osync_inode(inode, mapping,
2073						OSYNC_METADATA|OSYNC_DATA);
2074		}
2075	}
2076
2077	/*
2078	 * If we get here for O_DIRECT writes then we must have fallen through
2079	 * to buffered writes (block instantiation inside i_size).  So we sync
2080	 * the file data here, to try to honour O_DIRECT expectations.
2081	 */
2082	if (unlikely(file->f_flags & O_DIRECT) && written)
2083		status = filemap_write_and_wait(mapping);
2084
2085	pagevec_lru_add(&lru_pvec);
2086	return written ? written : status;
2087}
2088EXPORT_SYMBOL(generic_file_buffered_write);
2089
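/*
 * Core of the generic write path: validate the iovec, apply
 * generic_write_checks(), strip any setuid/setgid bits, update the
 * file times and then hand the request to the direct-IO or buffered
 * writer.  Locking (i_mutex) is the caller's responsibility.
 */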
2090static ssize_t
2091__generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
2092				unsigned long nr_segs, loff_t *ppos)
2093{
2094	struct file *file = iocb->ki_filp;
2095	struct address_space * mapping = file->f_mapping;
2096	size_t ocount;		/* original count */
2097	size_t count;		/* after file limit checks */
2098	struct inode 	*inode = mapping->host;
2099	unsigned long	seg;
2100	loff_t		pos;
2101	ssize_t		written;
2102	ssize_t		err;
2103
2104	ocount = 0;
2105	for (seg = 0; seg < nr_segs; seg++) {
2106		const struct iovec *iv = &iov[seg];
2107
2108		/*
2109		 * If any segment has a negative length, or the cumulative
2110		 * length ever wraps negative then return -EINVAL.
2111		 */
2112		ocount += iv->iov_len;
2113		if (unlikely((ssize_t)(ocount|iv->iov_len) < 0))
2114			return -EINVAL;
2115		if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
2116			continue;
2117		if (seg == 0)
2118			return -EFAULT;
2119		nr_segs = seg;
2120		ocount -= iv->iov_len;	/* This segment is no good */
2121		break;
2122	}
2123
2124	count = ocount;
2125	pos = *ppos;
2126
2127	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
2128
2129	/* We can write back this queue in page reclaim */
2130	current->backing_dev_info = mapping->backing_dev_info;
2131	written = 0;
2132
2133	err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
2134	if (err)
2135		goto out;
2136
2137	if (count == 0)
2138		goto out;
2139
2140	err = remove_suid(file->f_dentry);
2141	if (err)
2142		goto out;
2143
2144	file_update_time(file);
2145
2146	/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
2147	if (unlikely(file->f_flags & O_DIRECT)) {
2148		written = generic_file_direct_write(iocb, iov,
2149				&nr_segs, pos, ppos, count, ocount);
2150		if (written < 0 || written == count)
2151			goto out;
2152		/*
2153		 * direct-io write to a hole: fall through to buffered I/O
2154		 * for completing the rest of the request.
2155		 */
2156		pos += written;
2157		count -= written;
2158	}
2159
2160	written = generic_file_buffered_write(iocb, iov, nr_segs,
2161			pos, ppos, count, written);
2162out:
2163	current->backing_dev_info = NULL;
2164	return written ? written : err;
2165}
2167
2168ssize_t
2169generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
2170				unsigned long nr_segs, loff_t *ppos)
2171{
2172	struct file *file = iocb->ki_filp;
2173	struct address_space *mapping = file->f_mapping;
2174	struct inode *inode = mapping->host;
2175	ssize_t ret;
2176	loff_t pos = *ppos;
2177
2178	ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs, ppos);
2179
2180	if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2181		int err;
2182
2183		err = sync_page_range_nolock(inode, mapping, pos, ret);
2184		if (err < 0)
2185			ret = err;
2186	}
2187	return ret;
2188}
EXPORT_SYMBOL(generic_file_aio_write_nolock);
2189
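/*
 * Synchronous wrappers around the AIO path: run it with an on-stack
 * kiocb set up by init_sync_kiocb() and, if the request was queued
 * (-EIOCBQUEUED), wait for completion with wait_on_sync_kiocb().
 */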
2190static ssize_t
2191__generic_file_write_nolock(struct file *file, const struct iovec *iov,
2192				unsigned long nr_segs, loff_t *ppos)
2193{
2194	struct kiocb kiocb;
2195	ssize_t ret;
2196
2197	init_sync_kiocb(&kiocb, file);
2198	ret = __generic_file_aio_write_nolock(&kiocb, iov, nr_segs, ppos);
2199	if (ret == -EIOCBQUEUED)
2200		ret = wait_on_sync_kiocb(&kiocb);
2201	return ret;
2202}
2203
2204ssize_t
2205generic_file_write_nolock(struct file *file, const struct iovec *iov,
2206				unsigned long nr_segs, loff_t *ppos)
2207{
2208	struct kiocb kiocb;
2209	ssize_t ret;
2210
2211	init_sync_kiocb(&kiocb, file);
2212	ret = generic_file_aio_write_nolock(&kiocb, iov, nr_segs, ppos);
2213	if (ret == -EIOCBQUEUED)
2214		ret = wait_on_sync_kiocb(&kiocb);
2215	return ret;
2216}
2217EXPORT_SYMBOL(generic_file_write_nolock);
2218
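/*
 * AIO write entry point for a single user buffer.  The write itself
 * runs under i_mutex; for O_SYNC/IS_SYNC files the affected page range
 * is written out afterwards, once i_mutex has been dropped.
 */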
2219ssize_t generic_file_aio_write(struct kiocb *iocb, const char __user *buf,
2220			       size_t count, loff_t pos)
2221{
2222	struct file *file = iocb->ki_filp;
2223	struct address_space *mapping = file->f_mapping;
2224	struct inode *inode = mapping->host;
2225	ssize_t ret;
2226	struct iovec local_iov = { .iov_base = (void __user *)buf,
2227					.iov_len = count };
2228
2229	BUG_ON(iocb->ki_pos != pos);
2230
2231	mutex_lock(&inode->i_mutex);
2232	ret = __generic_file_aio_write_nolock(iocb, &local_iov, 1,
2233						&iocb->ki_pos);
2234	mutex_unlock(&inode->i_mutex);
2235
2236	if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2237		ssize_t err;
2238
2239		err = sync_page_range(inode, mapping, pos, ret);
2240		if (err < 0)
2241			ret = err;
2242	}
2243	return ret;
2244}
2245EXPORT_SYMBOL(generic_file_aio_write);
2246
2247ssize_t generic_file_write(struct file *file, const char __user *buf,
2248			   size_t count, loff_t *ppos)
2249{
2250	struct address_space *mapping = file->f_mapping;
2251	struct inode *inode = mapping->host;
2252	ssize_t	ret;
2253	struct iovec local_iov = { .iov_base = (void __user *)buf,
2254					.iov_len = count };
2255
2256	mutex_lock(&inode->i_mutex);
2257	ret = __generic_file_write_nolock(file, &local_iov, 1, ppos);
2258	mutex_unlock(&inode->i_mutex);
2259
2260	if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2261		ssize_t err;
2262
2263		err = sync_page_range(inode, mapping, *ppos - ret, ret);
2264		if (err < 0)
2265			ret = err;
2266	}
2267	return ret;
2268}
2269EXPORT_SYMBOL(generic_file_write);
2270
2271ssize_t generic_file_readv(struct file *filp, const struct iovec *iov,
2272			unsigned long nr_segs, loff_t *ppos)
2273{
2274	struct kiocb kiocb;
2275	ssize_t ret;
2276
2277	init_sync_kiocb(&kiocb, filp);
2278	ret = __generic_file_aio_read(&kiocb, iov, nr_segs, ppos);
2279	if (ret == -EIOCBQUEUED)
2280		ret = wait_on_sync_kiocb(&kiocb);
2281	return ret;
2282}
2283EXPORT_SYMBOL(generic_file_readv);
2284
2285ssize_t generic_file_writev(struct file *file, const struct iovec *iov,
2286			unsigned long nr_segs, loff_t *ppos)
2287{
2288	struct address_space *mapping = file->f_mapping;
2289	struct inode *inode = mapping->host;
2290	ssize_t ret;
2291
2292	mutex_lock(&inode->i_mutex);
2293	ret = __generic_file_write_nolock(file, iov, nr_segs, ppos);
2294	mutex_unlock(&inode->i_mutex);
2295
2296	if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2297		int err;
2298
2299		err = sync_page_range(inode, mapping, *ppos - ret, ret);
2300		if (err < 0)
2301			ret = err;
2302	}
2303	return ret;
2304}
2305EXPORT_SYMBOL(generic_file_writev);
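
/*
 * Illustrative sketch (not part of this file): a filesystem wanting
 * the generic behaviour typically just points its file_operations at
 * the helpers exported above, along the lines of
 *
 *	struct file_operations foo_file_operations = {
 *		.llseek		= generic_file_llseek,
 *		.read		= generic_file_read,
 *		.write		= generic_file_write,
 *		.aio_read	= generic_file_aio_read,
 *		.aio_write	= generic_file_aio_write,
 *		.readv		= generic_file_readv,
 *		.writev		= generic_file_writev,
 *		.mmap		= generic_file_mmap,
 *	};
 *
 * where foo_file_operations is a made-up name and the exact set of
 * methods is up to the filesystem.
 */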
2306
2307/*
2308 * Called under i_mutex for writes to S_ISREG files.  Returns -EIO if
2309 * something went wrong during pagecache shootdown.
2310 */
2311static ssize_t
2312generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
2313	loff_t offset, unsigned long nr_segs)
2314{
2315	struct file *file = iocb->ki_filp;
2316	struct address_space *mapping = file->f_mapping;
2317	ssize_t retval;
2318	size_t write_len = 0;
2319
2320	/*
2321	 * If it's a write, unmap all mmappings of the file up-front.  This
2322	 * will cause any pte dirty bits to be propagated into the pageframes
2323	 * for the subsequent filemap_write_and_wait().
2324	 */
2325	if (rw == WRITE) {
2326		write_len = iov_length(iov, nr_segs);
2327		if (mapping_mapped(mapping))
2328			unmap_mapping_range(mapping, offset, write_len, 0);
2329	}
2330
2331	retval = filemap_write_and_wait(mapping);
2332	if (retval == 0) {
2333		retval = mapping->a_ops->direct_IO(rw, iocb, iov,
2334						offset, nr_segs);
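		/*
		 * The write went around the pagecache, so any pages that
		 * were (re)instantiated over the written range while the
		 * I/O was in flight are now stale; invalidate them so
		 * later buffered reads see the new on-disk data.
		 */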
2335		if (rw == WRITE && mapping->nrpages) {
2336			pgoff_t end = (offset + write_len - 1)
2337						>> PAGE_CACHE_SHIFT;
2338			int err = invalidate_inode_pages2_range(mapping,
2339					offset >> PAGE_CACHE_SHIFT, end);
2340			if (err)
2341				retval = err;
2342		}
2343	}
2344	return retval;
2345}
2346