filemap.c revision ac6aadb24b7d4f0e54246732e221c102073412bf
/*
 * linux/mm/filemap.c
 *
 * Copyright (C) 1994-1999 Linus Torvalds
 */

/*
 * This file handles the generic file mmap semantics used by
 * most "normal" filesystems (but you don't /have/ to use this:
 * the NFS filesystem used to do this differently, for example)
 */
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/compiler.h>
#include <linux/fs.h>
#include <linux/uaccess.h>
#include <linux/aio.h>
#include <linux/capability.h>
#include <linux/kernel_stat.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/uio.h>
#include <linux/hash.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/pagevec.h>
#include <linux/blkdev.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/cpuset.h>
#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
#include <linux/memcontrol.h>
#include "internal.h"

/*
 * FIXME: remove all knowledge of the buffer layer from the core VM
 */
#include <linux/buffer_head.h> /* for generic_osync_inode */

#include <asm/mman.h>

static ssize_t
generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
	loff_t offset, unsigned long nr_segs);

/*
 * Shared mappings implemented 30.11.1994. It's not fully working yet,
 * though.
 *
 * Shared mappings now work. 15.8.1995 Bruno.
 *
 * finished 'unifying' the page and buffer cache and SMP-threaded the
 * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
 *
 * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
 */

/*
 * Lock ordering:
 *
 *  ->i_mmap_lock			(vmtruncate)
 *    ->private_lock			(__free_pte->__set_page_dirty_buffers)
 *      ->swap_lock			(exclusive_swap_page, others)
 *        ->mapping->tree_lock
 *
 *  ->i_mutex
 *    ->i_mmap_lock			(truncate->unmap_mapping_range)
 *
 *  ->mmap_sem
 *    ->i_mmap_lock
 *      ->page_table_lock or pte_lock	(various, mainly in memory.c)
 *        ->mapping->tree_lock		(arch-dependent flush_dcache_mmap_lock)
 *
 *  ->mmap_sem
 *    ->lock_page			(access_process_vm)
 *
 *  ->i_mutex				(generic_file_buffered_write)
 *    ->mmap_sem			(fault_in_pages_readable->do_page_fault)
 *
 *  ->i_mutex
 *    ->i_alloc_sem			(various)
 *
 *  ->inode_lock
 *    ->sb_lock				(fs/fs-writeback.c)
 *    ->mapping->tree_lock		(__sync_single_inode)
 *
 *  ->i_mmap_lock
 *    ->anon_vma.lock			(vma_adjust)
 *
 *  ->anon_vma.lock
 *    ->page_table_lock or pte_lock	(anon_vma_prepare and various)
 *
 *  ->page_table_lock or pte_lock
 *    ->swap_lock			(try_to_unmap_one)
 *    ->private_lock			(try_to_unmap_one)
 *    ->tree_lock			(try_to_unmap_one)
 *    ->zone.lru_lock			(follow_page->mark_page_accessed)
 *    ->zone.lru_lock			(check_pte_range->isolate_lru_page)
 *    ->private_lock			(page_remove_rmap->set_page_dirty)
 *    ->tree_lock			(page_remove_rmap->set_page_dirty)
 *    ->inode_lock			(page_remove_rmap->set_page_dirty)
 *    ->inode_lock			(zap_pte_range->set_page_dirty)
 *    ->private_lock			(zap_pte_range->__set_page_dirty_buffers)
 *
 *  ->task->proc_lock
 *    ->dcache_lock			(proc_pid_lookup)
 */

/*
 * Remove a page from the page cache and free it. Caller has to make
 * sure the page is locked and that nobody else uses it - or that usage
 * is safe.  The caller must hold a write_lock on the mapping's tree_lock.
 */
void __remove_from_page_cache(struct page *page)
{
	struct address_space *mapping = page->mapping;

	mem_cgroup_uncharge_page(page);
	radix_tree_delete(&mapping->page_tree, page->index);
	page->mapping = NULL;
	mapping->nrpages--;
	__dec_zone_page_state(page, NR_FILE_PAGES);
	BUG_ON(page_mapped(page));

	/*
	 * Some filesystems seem to re-dirty the page even after
	 * the VM has canceled the dirty bit (eg ext3 journaling).
	 *
	 * Fix it up by doing a final dirty accounting check after
	 * having removed the page entirely.
	 */
	if (PageDirty(page) && mapping_cap_account_dirty(mapping)) {
		dec_zone_page_state(page, NR_FILE_DIRTY);
		dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
	}
}

void remove_from_page_cache(struct page *page)
{
	struct address_space *mapping = page->mapping;

	BUG_ON(!PageLocked(page));

	write_lock_irq(&mapping->tree_lock);
	__remove_from_page_cache(page);
	write_unlock_irq(&mapping->tree_lock);
}

static int sync_page(void *word)
{
	struct address_space *mapping;
	struct page *page;

	page = container_of((unsigned long *)word, struct page, flags);

	/*
	 * page_mapping() is being called without PG_locked held.
	 * Some knowledge of the state and use of the page is used to
	 * reduce the requirements down to a memory barrier.
	 * The danger here is of a stale page_mapping() return value
	 * indicating a struct address_space different from the one it's
	 * associated with when it is associated with one.
	 * After smp_mb(), it's either the correct page_mapping() for
	 * the page, or an old page_mapping() and the page's own
	 * page_mapping() has gone NULL.
	 * The ->sync_page() address_space operation must tolerate
	 * page_mapping() going NULL. By an amazing coincidence,
	 * this comes about because none of the users of the page
	 * in the ->sync_page() methods make essential use of the
	 * page_mapping(), merely passing the page down to the backing
	 * device's unplug functions when it's non-NULL, which in turn
	 * ignore it for all cases but swap, where only page_private(page) is
	 * of interest. When page_mapping() does go NULL, the entire
	 * call stack gracefully ignores the page and returns.
	 * -- wli
	 */
	smp_mb();
	mapping = page_mapping(page);
	if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
		mapping->a_ops->sync_page(page);
	io_schedule();
	return 0;
}

static int sync_page_killable(void *word)
{
	sync_page(word);
	return fatal_signal_pending(current) ? -EINTR : 0;
}

/**
 * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
 * @mapping: address space structure to write
 * @start: offset in bytes where the range starts
 * @end: offset in bytes where the range ends (inclusive)
 * @sync_mode: enable synchronous operation
 *
 * Start writeback against all of a mapping's dirty pages that lie
 * within the byte offsets <start, end> inclusive.
 *
 * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
 * opposed to a regular memory cleansing writeback.  The difference between
 * these two operations is that if a dirty page/buffer is encountered, it must
 * be waited upon, and not just skipped over.
 */
int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
				loff_t end, int sync_mode)
{
	int ret;
	struct writeback_control wbc = {
		.sync_mode = sync_mode,
		.nr_to_write = mapping->nrpages * 2,
		.range_start = start,
		.range_end = end,
	};

	if (!mapping_cap_writeback_dirty(mapping))
		return 0;

	ret = do_writepages(mapping, &wbc);
	return ret;
}

static inline int __filemap_fdatawrite(struct address_space *mapping,
	int sync_mode)
{
	return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode);
}

int filemap_fdatawrite(struct address_space *mapping)
{
	return __filemap_fdatawrite(mapping, WB_SYNC_ALL);
}
EXPORT_SYMBOL(filemap_fdatawrite);

static int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
				loff_t end)
{
	return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
}

/**
 * filemap_flush - mostly a non-blocking flush
 * @mapping: target address_space
 *
 * This is a mostly non-blocking flush.  Not suitable for data-integrity
 * purposes - I/O may not be started against all dirty pages.
 */
int filemap_flush(struct address_space *mapping)
{
	return __filemap_fdatawrite(mapping, WB_SYNC_NONE);
}
EXPORT_SYMBOL(filemap_flush);

/**
 * wait_on_page_writeback_range - wait for writeback to complete
 * @mapping: target address_space
 * @start: beginning page index
 * @end: ending page index
 *
 * Wait for writeback to complete against pages indexed by start->end
 * inclusive
 */
int wait_on_page_writeback_range(struct address_space *mapping,
				pgoff_t start, pgoff_t end)
{
	struct pagevec pvec;
	int nr_pages;
	int ret = 0;
	pgoff_t index;

	if (end < start)
		return 0;

	pagevec_init(&pvec, 0);
	index = start;
	while ((index <= end) &&
			(nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
			PAGECACHE_TAG_WRITEBACK,
			min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) {
		unsigned i;

		for (i = 0; i < nr_pages; i++) {
			struct page *page = pvec.pages[i];

			/* until radix tree lookup accepts end_index */
			if (page->index > end)
				continue;

			wait_on_page_writeback(page);
			if (PageError(page))
				ret = -EIO;
		}
		pagevec_release(&pvec);
		cond_resched();
	}

	/* Check for outstanding write errors */
	if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
		ret = -ENOSPC;
	if (test_and_clear_bit(AS_EIO, &mapping->flags))
		ret = -EIO;

	return ret;
}

/**
 * sync_page_range - write and wait on all pages in the passed range
 * @inode: target inode
 * @mapping: target address_space
 * @pos: beginning offset in pages to write
 * @count: number of bytes to write
 *
 * Write and wait upon all the pages in the passed range.  This is a "data
 * integrity" operation.  It waits upon in-flight writeout before starting and
 * waiting upon new writeout.  If there was an IO error, return it.
 *
 * We need to re-take i_mutex during the generic_osync_inode list walk because
 * it is otherwise livelockable.
 */
int sync_page_range(struct inode *inode, struct address_space *mapping,
			loff_t pos, loff_t count)
{
	pgoff_t start = pos >> PAGE_CACHE_SHIFT;
	pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
	int ret;

	if (!mapping_cap_writeback_dirty(mapping) || !count)
		return 0;
	ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
	if (ret == 0) {
		mutex_lock(&inode->i_mutex);
		ret = generic_osync_inode(inode, mapping, OSYNC_METADATA);
		mutex_unlock(&inode->i_mutex);
	}
	if (ret == 0)
		ret = wait_on_page_writeback_range(mapping, start, end);
	return ret;
}
EXPORT_SYMBOL(sync_page_range);

/**
 * sync_page_range_nolock - write & wait on all pages in the passed range without locking
 * @inode: target inode
 * @mapping: target address_space
 * @pos: beginning offset in pages to write
 * @count: number of bytes to write
 *
 * Note: Holding i_mutex across sync_page_range_nolock() is not a good idea
 * as it forces O_SYNC writers to different parts of the same file
 * to be serialised right until io completion.
 */
int sync_page_range_nolock(struct inode *inode, struct address_space *mapping,
			   loff_t pos, loff_t count)
{
	pgoff_t start = pos >> PAGE_CACHE_SHIFT;
	pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
	int ret;

	if (!mapping_cap_writeback_dirty(mapping) || !count)
		return 0;
	ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
	if (ret == 0)
		ret = generic_osync_inode(inode, mapping, OSYNC_METADATA);
	if (ret == 0)
		ret = wait_on_page_writeback_range(mapping, start, end);
	return ret;
}
EXPORT_SYMBOL(sync_page_range_nolock);

/**
 * filemap_fdatawait - wait for all under-writeback pages to complete
 * @mapping: address space structure to wait for
 *
 * Walk the list of under-writeback pages of the given address space
 * and wait for all of them.
 */
int filemap_fdatawait(struct address_space *mapping)
{
	loff_t i_size = i_size_read(mapping->host);

	if (i_size == 0)
		return 0;

	return wait_on_page_writeback_range(mapping, 0,
				(i_size - 1) >> PAGE_CACHE_SHIFT);
}
EXPORT_SYMBOL(filemap_fdatawait);

int filemap_write_and_wait(struct address_space *mapping)
{
	int err = 0;

	if (mapping->nrpages) {
		err = filemap_fdatawrite(mapping);
		/*
		 * Even if the above returned error, the pages may be
		 * written partially (e.g. -ENOSPC), so we wait for it.
		 * But the -EIO is special case, it may indicate the worst
		 * thing (e.g. bug) happened, so we avoid waiting for it.
		 */
		if (err != -EIO) {
			int err2 = filemap_fdatawait(mapping);
			if (!err)
				err = err2;
		}
	}
	return err;
}
EXPORT_SYMBOL(filemap_write_and_wait);

/**
 * filemap_write_and_wait_range - write out & wait on a file range
 * @mapping: the address_space for the pages
 * @lstart: offset in bytes where the range starts
 * @lend: offset in bytes where the range ends (inclusive)
 *
 * Write out and wait upon file offsets lstart->lend, inclusive.
 *
 * Note that `lend' is inclusive (describes the last byte to be written) so
 * that this function can be used to write to the very end-of-file (end = -1).
 */
int filemap_write_and_wait_range(struct address_space *mapping,
				 loff_t lstart, loff_t lend)
{
	int err = 0;

	if (mapping->nrpages) {
		err = __filemap_fdatawrite_range(mapping, lstart, lend,
						 WB_SYNC_ALL);
		/* See comment of filemap_write_and_wait() */
		if (err != -EIO) {
			int err2 = wait_on_page_writeback_range(mapping,
						lstart >> PAGE_CACHE_SHIFT,
						lend >> PAGE_CACHE_SHIFT);
			if (!err)
				err = err2;
		}
	}
	return err;
}

/**
 * add_to_page_cache - add newly allocated pagecache pages
 * @page: page to add
 * @mapping: the page's address_space
 * @offset: page index
 * @gfp_mask: page allocation mode
 *
 * This function is used to add newly allocated pagecache pages;
 * the page is new, so we can just run SetPageLocked() against it.
 * The other page state flags were set by rmqueue().
 *
 * This function does not add the page to the LRU.  The caller must do that.
 */
int add_to_page_cache(struct page *page, struct address_space *mapping,
		pgoff_t offset, gfp_t gfp_mask)
{
	int error = mem_cgroup_cache_charge(page, current->mm,
					gfp_mask & ~__GFP_HIGHMEM);
	if (error)
		goto out;

	error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
	if (error == 0) {
		write_lock_irq(&mapping->tree_lock);
		error = radix_tree_insert(&mapping->page_tree, offset, page);
		if (!error) {
			page_cache_get(page);
			SetPageLocked(page);
			page->mapping = mapping;
			page->index = offset;
			mapping->nrpages++;
			__inc_zone_page_state(page, NR_FILE_PAGES);
		} else
			mem_cgroup_uncharge_page(page);

		write_unlock_irq(&mapping->tree_lock);
		radix_tree_preload_end();
	} else
		mem_cgroup_uncharge_page(page);
out:
	return error;
}
EXPORT_SYMBOL(add_to_page_cache);

int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
				pgoff_t offset, gfp_t gfp_mask)
{
	int ret = add_to_page_cache(page, mapping, offset, gfp_mask);
	if (ret == 0)
		lru_cache_add(page);
	return ret;
}

#ifdef CONFIG_NUMA
struct page *__page_cache_alloc(gfp_t gfp)
{
	if (cpuset_do_page_mem_spread()) {
		int n = cpuset_mem_spread_node();
		return alloc_pages_node(n, gfp, 0);
	}
	return alloc_pages(gfp, 0);
}
EXPORT_SYMBOL(__page_cache_alloc);
#endif

static int __sleep_on_page_lock(void *word)
{
	io_schedule();
	return 0;
}

/*
 * In order to wait for pages to become available there must be
 * waitqueues associated with pages. By using a hash table of
 * waitqueues where the bucket discipline is to maintain all
 * waiters on the same queue and wake all when any of the pages
 * become available, and for the woken contexts to check to be
 * sure the appropriate page became available, this saves space
 * at a cost of "thundering herd" phenomena during rare hash
 * collisions.
 */
static wait_queue_head_t *page_waitqueue(struct page *page)
{
	const struct zone *zone = page_zone(page);

	return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)];
}

static inline void wake_up_page(struct page *page, int bit)
{
	__wake_up_bit(page_waitqueue(page), &page->flags, bit);
}

void wait_on_page_bit(struct page *page, int bit_nr)
{
	DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);

	if (test_bit(bit_nr, &page->flags))
		__wait_on_bit(page_waitqueue(page), &wait, sync_page,
							TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(wait_on_page_bit);

/**
 * unlock_page - unlock a locked page
 * @page: the page
 *
 * Unlocks the page and wakes up sleepers in ___wait_on_page_locked().
 * Also wakes sleepers in wait_on_page_writeback() because the wakeup
 * mechanism between PageLocked pages and PageWriteback pages is shared.
 * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
 *
 * The first mb is necessary to safely close the critical section opened by the
 * TestSetPageLocked(), the second mb is necessary to enforce ordering between
 * the clear_bit and the read of the waitqueue (to avoid SMP races with a
 * parallel wait_on_page_locked()).
 */
void unlock_page(struct page *page)
{
	smp_mb__before_clear_bit();
	if (!TestClearPageLocked(page))
		BUG();
	smp_mb__after_clear_bit();
	wake_up_page(page, PG_locked);
}
EXPORT_SYMBOL(unlock_page);

/**
 * end_page_writeback - end writeback against a page
 * @page: the page
 */
void end_page_writeback(struct page *page)
{
	if (TestClearPageReclaim(page))
		rotate_reclaimable_page(page);

	if (!test_clear_page_writeback(page))
		BUG();

	smp_mb__after_clear_bit();
	wake_up_page(page, PG_writeback);
}
EXPORT_SYMBOL(end_page_writeback);

/**
 * __lock_page - get a lock on the page, assuming we need to sleep to get it
 * @page: the page to lock
 *
 * Ugly. Running sync_page() in state TASK_UNINTERRUPTIBLE is scary.  If some
 * random driver's requestfn sets TASK_RUNNING, we could busywait.  However
 * chances are that on the second loop, the block layer's plug list is empty,
 * so sync_page() will then return in state TASK_UNINTERRUPTIBLE.
 */
void __lock_page(struct page *page)
{
	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);

	__wait_on_bit_lock(page_waitqueue(page), &wait, sync_page,
							TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(__lock_page);

int __lock_page_killable(struct page *page)
{
	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);

	return __wait_on_bit_lock(page_waitqueue(page), &wait,
					sync_page_killable, TASK_KILLABLE);
}

/**
 * __lock_page_nosync - get a lock on the page, without calling sync_page()
 * @page: the page to lock
 *
 * Variant of lock_page that does not require the caller to hold a reference
 * on the page's mapping.
 */
void __lock_page_nosync(struct page *page)
{
	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
	__wait_on_bit_lock(page_waitqueue(page), &wait, __sleep_on_page_lock,
							TASK_UNINTERRUPTIBLE);
}

/**
 * find_get_page - find and get a page reference
 * @mapping: the address_space to search
 * @offset: the page index
 *
 * Is there a pagecache struct page at the given (mapping, offset) tuple?
 * If yes, increment its refcount and return it; if no, return NULL.
 */
struct page * find_get_page(struct address_space *mapping, pgoff_t offset)
{
	struct page *page;

	read_lock_irq(&mapping->tree_lock);
	page = radix_tree_lookup(&mapping->page_tree, offset);
	if (page)
		page_cache_get(page);
	read_unlock_irq(&mapping->tree_lock);
	return page;
}
EXPORT_SYMBOL(find_get_page);

/**
 * find_lock_page - locate, pin and lock a pagecache page
 * @mapping: the address_space to search
 * @offset: the page index
 *
 * Locates the desired pagecache page, locks it, increments its reference
 * count and returns its address.
 *
 * Returns zero if the page was not present. find_lock_page() may sleep.
 */
struct page *find_lock_page(struct address_space *mapping,
				pgoff_t offset)
{
	struct page *page;

repeat:
	read_lock_irq(&mapping->tree_lock);
	page = radix_tree_lookup(&mapping->page_tree, offset);
	if (page) {
		page_cache_get(page);
		if (TestSetPageLocked(page)) {
			read_unlock_irq(&mapping->tree_lock);
			__lock_page(page);

			/* Has the page been truncated while we slept? */
			if (unlikely(page->mapping != mapping)) {
				unlock_page(page);
				page_cache_release(page);
				goto repeat;
			}
			VM_BUG_ON(page->index != offset);
			goto out;
		}
	}
	read_unlock_irq(&mapping->tree_lock);
out:
	return page;
}
EXPORT_SYMBOL(find_lock_page);

/**
 * find_or_create_page - locate or add a pagecache page
 * @mapping: the page's address_space
 * @index: the page's index into the mapping
 * @gfp_mask: page allocation mode
 *
 * Locates a page in the pagecache.  If the page is not present, a new page
 * is allocated using @gfp_mask and is added to the pagecache and to the VM's
 * LRU list.  The returned page is locked and has its reference count
 * incremented.
 *
 * find_or_create_page() may sleep, even if @gfp_flags specifies an atomic
 * allocation!
 *
 * find_or_create_page() returns the desired page's address, or zero on
 * memory exhaustion.
 */
struct page *find_or_create_page(struct address_space *mapping,
		pgoff_t index, gfp_t gfp_mask)
{
	struct page *page;
	int err;
repeat:
	page = find_lock_page(mapping, index);
	if (!page) {
		page = __page_cache_alloc(gfp_mask);
		if (!page)
			return NULL;
		err = add_to_page_cache_lru(page, mapping, index, gfp_mask);
		if (unlikely(err)) {
			page_cache_release(page);
			page = NULL;
			if (err == -EEXIST)
				goto repeat;
		}
	}
	return page;
}
EXPORT_SYMBOL(find_or_create_page);

/**
 * find_get_pages - gang pagecache lookup
 * @mapping: The address_space to search
 * @start: The starting page index
 * @nr_pages: The maximum number of pages
 * @pages: Where the resulting pages are placed
 *
 * find_get_pages() will search for and return a group of up to
 * @nr_pages pages in the mapping.  The pages are placed at @pages.
 * find_get_pages() takes a reference against the returned pages.
 *
 * The search returns a group of mapping-contiguous pages with ascending
 * indexes.  There may be holes in the indices due to not-present pages.
 *
 * find_get_pages() returns the number of pages which were found.
 */
unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
			    unsigned int nr_pages, struct page **pages)
{
	unsigned int i;
	unsigned int ret;

	read_lock_irq(&mapping->tree_lock);
	ret = radix_tree_gang_lookup(&mapping->page_tree,
				(void **)pages, start, nr_pages);
	for (i = 0; i < ret; i++)
		page_cache_get(pages[i]);
	read_unlock_irq(&mapping->tree_lock);
	return ret;
}

/**
 * find_get_pages_contig - gang contiguous pagecache lookup
 * @mapping: The address_space to search
 * @index: The starting page index
 * @nr_pages: The maximum number of pages
 * @pages: Where the resulting pages are placed
 *
 * find_get_pages_contig() works exactly like find_get_pages(), except
 * that the returned number of pages are guaranteed to be contiguous.
 *
 * find_get_pages_contig() returns the number of pages which were found.
 */
unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
			       unsigned int nr_pages, struct page **pages)
{
	unsigned int i;
	unsigned int ret;

	read_lock_irq(&mapping->tree_lock);
	ret = radix_tree_gang_lookup(&mapping->page_tree,
				(void **)pages, index, nr_pages);
	for (i = 0; i < ret; i++) {
		if (pages[i]->mapping == NULL || pages[i]->index != index)
			break;

		page_cache_get(pages[i]);
		index++;
	}
	read_unlock_irq(&mapping->tree_lock);
	return i;
}
EXPORT_SYMBOL(find_get_pages_contig);

/**
 * find_get_pages_tag - find and return pages that match @tag
 * @mapping: the address_space to search
 * @index: the starting page index
 * @tag: the tag index
 * @nr_pages: the maximum number of pages
 * @pages: where the resulting pages are placed
 *
 * Like find_get_pages, except we only return pages which are tagged with
 * @tag.  We update @index to index the next page for the traversal.
 */
unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
			int tag, unsigned int nr_pages, struct page **pages)
{
	unsigned int i;
	unsigned int ret;

	read_lock_irq(&mapping->tree_lock);
	ret = radix_tree_gang_lookup_tag(&mapping->page_tree,
				(void **)pages, *index, nr_pages, tag);
	for (i = 0; i < ret; i++)
		page_cache_get(pages[i]);
	if (ret)
		*index = pages[ret - 1]->index + 1;
	read_unlock_irq(&mapping->tree_lock);
	return ret;
}
EXPORT_SYMBOL(find_get_pages_tag);

/**
 * grab_cache_page_nowait - returns locked page at given index in given cache
 * @mapping: target address_space
 * @index: the page index
 *
 * Same as grab_cache_page(), but do not wait if the page is unavailable.
 * This is intended for speculative data generators, where the data can
 * be regenerated if the page couldn't be grabbed.  This routine should
 * be safe to call while holding the lock for another page.
 *
 * Clear __GFP_FS when allocating the page to avoid recursion into the fs
 * and deadlock against the caller's locked page.
 */
struct page *
grab_cache_page_nowait(struct address_space *mapping, pgoff_t index)
{
	struct page *page = find_get_page(mapping, index);

	if (page) {
		if (!TestSetPageLocked(page))
			return page;
		page_cache_release(page);
		return NULL;
	}
	page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS);
	if (page && add_to_page_cache_lru(page, mapping, index, GFP_KERNEL)) {
		page_cache_release(page);
		page = NULL;
	}
	return page;
}
EXPORT_SYMBOL(grab_cache_page_nowait);

/*
 * CD/DVDs are error prone. When a medium error occurs, the driver may fail
 * a _large_ part of the i/o request. Imagine the worst scenario:
 *
 *      ---R__________________________________________B__________
 *         ^ reading here                             ^ bad block(assume 4k)
 *
 * read(R) => miss => readahead(R...B) => media error => frustrating retries
 * => failing the whole request => read(R) => read(R+1) =>
 * readahead(R+1...B+1) => bang => read(R+2) => read(R+3) =>
 * readahead(R+3...B+2) => bang => read(R+3) => read(R+4) =>
 * readahead(R+4...B+3) => bang => read(R+4) => read(R+5) => ......
 *
 * It is going insane. Fix it by quickly scaling down the readahead size.
 */
static void shrink_readahead_size_eio(struct file *filp,
					struct file_ra_state *ra)
{
	if (!ra->ra_pages)
		return;

	ra->ra_pages /= 4;
}

/**
 * do_generic_file_read - generic file read routine
 * @filp: the file to read
 * @ppos: current file position
 * @desc: read_descriptor
 * @actor: read method
 *
 * This is a generic file read routine, and uses the
 * mapping->a_ops->readpage() function for the actual low-level stuff.
 *
 * This is really ugly. But the goto's actually try to clarify some
 * of the logic when it comes to error handling etc.
 */
static void do_generic_file_read(struct file *filp, loff_t *ppos,
		read_descriptor_t *desc, read_actor_t actor)
{
	struct address_space *mapping = filp->f_mapping;
	struct inode *inode = mapping->host;
	struct file_ra_state *ra = &filp->f_ra;
	pgoff_t index;
	pgoff_t last_index;
	pgoff_t prev_index;
	unsigned long offset;      /* offset into pagecache page */
	unsigned int prev_offset;
	int error;

	index = *ppos >> PAGE_CACHE_SHIFT;
	prev_index = ra->prev_pos >> PAGE_CACHE_SHIFT;
	prev_offset = ra->prev_pos & (PAGE_CACHE_SIZE-1);
	last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
	offset = *ppos & ~PAGE_CACHE_MASK;

	for (;;) {
		struct page *page;
		pgoff_t end_index;
		loff_t isize;
		unsigned long nr, ret;

		cond_resched();
find_page:
		page = find_get_page(mapping, index);
		if (!page) {
			page_cache_sync_readahead(mapping,
					ra, filp,
					index, last_index - index);
			page = find_get_page(mapping, index);
			if (unlikely(page == NULL))
				goto no_cached_page;
		}
		if (PageReadahead(page)) {
			page_cache_async_readahead(mapping,
					ra, filp, page,
					index, last_index - index);
		}
		if (!PageUptodate(page))
			goto page_not_up_to_date;
page_ok:
		/*
		 * i_size must be checked after we know the page is Uptodate.
		 *
		 * Checking i_size after the check allows us to calculate
		 * the correct value for "nr", which means the zero-filled
		 * part of the page is not copied back to userspace (unless
		 * another truncate extends the file - this is desired though).
		 */

		isize = i_size_read(inode);
		end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
		if (unlikely(!isize || index > end_index)) {
			page_cache_release(page);
			goto out;
		}

		/* nr is the maximum number of bytes to copy from this page */
		nr = PAGE_CACHE_SIZE;
		if (index == end_index) {
			nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
			if (nr <= offset) {
				page_cache_release(page);
				goto out;
			}
		}
		nr = nr - offset;

		/* If users can be writing to this page using arbitrary
		 * virtual addresses, take care about potential aliasing
		 * before reading the page on the kernel side.
		 */
		if (mapping_writably_mapped(mapping))
			flush_dcache_page(page);

		/*
		 * When a sequential read accesses a page several times,
		 * only mark it as accessed the first time.
		 */
		if (prev_index != index || offset != prev_offset)
			mark_page_accessed(page);
		prev_index = index;

		/*
		 * Ok, we have the page, and it's up-to-date, so
		 * now we can copy it to user space...
		 *
		 * The actor routine returns how many bytes were actually used..
		 * NOTE! This may not be the same as how much of a user buffer
		 * we filled up (we may be padding etc), so we can only update
		 * "pos" here (the actor routine has to update the user buffer
		 * pointers and the remaining count).
		 */
		ret = actor(desc, page, offset, nr);
		offset += ret;
		index += offset >> PAGE_CACHE_SHIFT;
		offset &= ~PAGE_CACHE_MASK;
		prev_offset = offset;

		page_cache_release(page);
		if (ret == nr && desc->count)
			continue;
		goto out;

page_not_up_to_date:
		/* Get exclusive access to the page ... */
		if (lock_page_killable(page))
			goto readpage_eio;

		/* Did it get truncated before we got the lock? */
		if (!page->mapping) {
			unlock_page(page);
			page_cache_release(page);
			continue;
		}

		/* Did somebody else fill it already? */
		if (PageUptodate(page)) {
			unlock_page(page);
			goto page_ok;
		}

readpage:
		/* Start the actual read. The read will unlock the page. */
		error = mapping->a_ops->readpage(filp, page);

		if (unlikely(error)) {
			if (error == AOP_TRUNCATED_PAGE) {
				page_cache_release(page);
				goto find_page;
			}
			goto readpage_error;
		}

		if (!PageUptodate(page)) {
			if (lock_page_killable(page))
				goto readpage_eio;
			if (!PageUptodate(page)) {
				if (page->mapping == NULL) {
					/*
					 * invalidate_inode_pages got it
					 */
					unlock_page(page);
					page_cache_release(page);
					goto find_page;
				}
				unlock_page(page);
				shrink_readahead_size_eio(filp, ra);
				goto readpage_eio;
			}
			unlock_page(page);
		}

		goto page_ok;

readpage_eio:
		error = -EIO;
readpage_error:
		/* UHHUH! A synchronous read error occurred. Report it */
		desc->error = error;
		page_cache_release(page);
		goto out;

no_cached_page:
		/*
		 * Ok, it wasn't cached, so we need to create a new
		 * page..
		 */
		page = page_cache_alloc_cold(mapping);
		if (!page) {
			desc->error = -ENOMEM;
			goto out;
		}
		error = add_to_page_cache_lru(page, mapping,
						index, GFP_KERNEL);
		if (error) {
			page_cache_release(page);
			if (error == -EEXIST)
				goto find_page;
			desc->error = error;
			goto out;
		}
		goto readpage;
	}

out:
	ra->prev_pos = prev_index;
	ra->prev_pos <<= PAGE_CACHE_SHIFT;
	ra->prev_pos |= prev_offset;

	*ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset;
	if (filp)
		file_accessed(filp);
}

int file_read_actor(read_descriptor_t *desc, struct page *page,
			unsigned long offset, unsigned long size)
{
	char *kaddr;
	unsigned long left, count = desc->count;

	if (size > count)
		size = count;

	/*
	 * Faults on the destination of a read are common, so do it before
	 * taking the kmap.
	 */
	if (!fault_in_pages_writeable(desc->arg.buf, size)) {
		kaddr = kmap_atomic(page, KM_USER0);
		left = __copy_to_user_inatomic(desc->arg.buf,
						kaddr + offset, size);
		kunmap_atomic(kaddr, KM_USER0);
		if (left == 0)
			goto success;
	}

	/* Do it the slow way */
	kaddr = kmap(page);
	left = __copy_to_user(desc->arg.buf, kaddr + offset, size);
	kunmap(page);

	if (left) {
		size -= left;
		desc->error = -EFAULT;
	}
success:
	desc->count = count - size;
	desc->written += size;
	desc->arg.buf += size;
	return size;
}

/*
 * Performs necessary checks before doing a write
 * @iov: io vector request
 * @nr_segs: number of segments in the iovec
 * @count: number of bytes to write
 * @access_flags: type of access: %VERIFY_READ or %VERIFY_WRITE
 *
 * Adjust number of segments and amount of bytes to write (nr_segs should be
 * properly initialized first). Returns appropriate error code that caller
 * should return or zero in case that write should be allowed.
 */
int generic_segment_checks(const struct iovec *iov,
			unsigned long *nr_segs, size_t *count, int access_flags)
{
	unsigned long seg;
	size_t cnt = 0;
	for (seg = 0; seg < *nr_segs; seg++) {
		const struct iovec *iv = &iov[seg];

		/*
		 * If any segment has a negative length, or the cumulative
		 * length ever wraps negative then return -EINVAL.
		 */
		cnt += iv->iov_len;
		if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
			return -EINVAL;
		if (access_ok(access_flags, iv->iov_base, iv->iov_len))
			continue;
		if (seg == 0)
			return -EFAULT;
		*nr_segs = seg;
		cnt -= iv->iov_len;	/* This segment is no good */
		break;
	}
	*count = cnt;
	return 0;
}
EXPORT_SYMBOL(generic_segment_checks);

/**
 * generic_file_aio_read - generic filesystem read routine
 * @iocb: kernel I/O control block
 * @iov: io vector request
 * @nr_segs: number of segments in the iovec
 * @pos: current file position
 *
 * This is the "read()" routine for all filesystems
 * that can use the page cache directly.
 */
ssize_t
generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
		unsigned long nr_segs, loff_t pos)
{
	struct file *filp = iocb->ki_filp;
	ssize_t retval;
	unsigned long seg;
	size_t count;
	loff_t *ppos = &iocb->ki_pos;

	count = 0;
	retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
	if (retval)
		return retval;

	/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
	if (filp->f_flags & O_DIRECT) {
		loff_t size;
		struct address_space *mapping;
		struct inode *inode;

		mapping = filp->f_mapping;
		inode = mapping->host;
		retval = 0;
		if (!count)
			goto out; /* skip atime */
		size = i_size_read(inode);
		if (pos < size) {
			retval = generic_file_direct_IO(READ, iocb,
						iov, pos, nr_segs);
			if (retval > 0)
				*ppos = pos + retval;
		}
		if (likely(retval != 0)) {
			file_accessed(filp);
			goto out;
		}
	}

	retval = 0;
	if (count) {
		for (seg = 0; seg < nr_segs; seg++) {
			read_descriptor_t desc;

			desc.written = 0;
			desc.arg.buf = iov[seg].iov_base;
			desc.count = iov[seg].iov_len;
			if (desc.count == 0)
				continue;
			desc.error = 0;
			do_generic_file_read(filp, ppos, &desc, file_read_actor);
			retval += desc.written;
			if (desc.error) {
				retval = retval ?: desc.error;
				break;
			}
			if (desc.count > 0)
				break;
		}
	}
out:
	return retval;
}
EXPORT_SYMBOL(generic_file_aio_read);

static ssize_t
do_readahead(struct address_space *mapping, struct file *filp,
	     pgoff_t index, unsigned long nr)
{
	if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage)
		return -EINVAL;

	force_page_cache_readahead(mapping, filp, index,
					max_sane_readahead(nr));
	return 0;
}

asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count)
{
	ssize_t ret;
	struct file *file;

	ret = -EBADF;
	file = fget(fd);
	if (file) {
		if (file->f_mode & FMODE_READ) {
			struct address_space *mapping = file->f_mapping;
			pgoff_t start = offset >> PAGE_CACHE_SHIFT;
			pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT;
			unsigned long len = end - start + 1;
			ret = do_readahead(mapping, file, start, len);
		}
		fput(file);
	}
	return ret;
}

#ifdef CONFIG_MMU
/**
 * page_cache_read - adds requested page to the page cache if not already there
 * @file: file to read
 * @offset: page index
 *
 * This adds the requested page to the page cache if it isn't already there,
 * and schedules an I/O to read in its contents from disk.
 */
static int page_cache_read(struct file *file, pgoff_t offset)
{
	struct address_space *mapping = file->f_mapping;
	struct page *page;
	int ret;

	do {
		page = page_cache_alloc_cold(mapping);
		if (!page)
			return -ENOMEM;

		ret = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL);
		if (ret == 0)
			ret = mapping->a_ops->readpage(file, page);
		else if (ret == -EEXIST)
			ret = 0; /* losing race to add is OK */

		page_cache_release(page);

	} while (ret == AOP_TRUNCATED_PAGE);

	return ret;
}

#define MMAP_LOTSAMISS  (100)

/**
 * filemap_fault - read in file data for page fault handling
 * @vma: vma in which the fault was taken
 * @vmf: struct vm_fault containing details of the fault
 *
 * filemap_fault() is invoked via the vma operations vector for a
 * mapped memory region to read in file data during a page fault.
 *
 * The goto's are kind of ugly, but this streamlines the normal case of having
 * it in the page cache, and handles the special cases reasonably without
 * having a lot of duplicated code.
 */
int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	int error;
	struct file *file = vma->vm_file;
	struct address_space *mapping = file->f_mapping;
	struct file_ra_state *ra = &file->f_ra;
	struct inode *inode = mapping->host;
	struct page *page;
	pgoff_t size;
	int did_readaround = 0;
	int ret = 0;

	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
	if (vmf->pgoff >= size)
		return VM_FAULT_SIGBUS;

	/* If we don't want any read-ahead, don't bother */
	if (VM_RandomReadHint(vma))
		goto no_cached_page;

	/*
	 * Do we have something in the page cache already?
	 */
retry_find:
	page = find_lock_page(mapping, vmf->pgoff);
	/*
	 * For sequential accesses, we use the generic readahead logic.
	 */
	if (VM_SequentialReadHint(vma)) {
		if (!page) {
			page_cache_sync_readahead(mapping, ra, file,
							   vmf->pgoff, 1);
			page = find_lock_page(mapping, vmf->pgoff);
			if (!page)
				goto no_cached_page;
		}
		if (PageReadahead(page)) {
			page_cache_async_readahead(mapping, ra, file, page,
							   vmf->pgoff, 1);
		}
	}

	if (!page) {
		unsigned long ra_pages;

		ra->mmap_miss++;

		/*
		 * Do we miss much more than hit in this file? If so,
		 * stop bothering with read-ahead. It will only hurt.
		 */
		if (ra->mmap_miss > MMAP_LOTSAMISS)
			goto no_cached_page;

		/*
		 * To keep the pgmajfault counter straight, we need to
		 * check did_readaround, as this is an inner loop.
		 */
		if (!did_readaround) {
			ret = VM_FAULT_MAJOR;
			count_vm_event(PGMAJFAULT);
		}
		did_readaround = 1;
		ra_pages = max_sane_readahead(file->f_ra.ra_pages);
		if (ra_pages) {
			pgoff_t start = 0;

			if (vmf->pgoff > ra_pages / 2)
				start = vmf->pgoff - ra_pages / 2;
			do_page_cache_readahead(mapping, file, start, ra_pages);
		}
		page = find_lock_page(mapping, vmf->pgoff);
		if (!page)
			goto no_cached_page;
	}

	if (!did_readaround)
		ra->mmap_miss--;

	/*
	 * We have a locked page in the page cache, now we need to check
	 * that it's up-to-date. If not, it is going to be due to an error.
	 */
	if (unlikely(!PageUptodate(page)))
		goto page_not_uptodate;

	/* Must recheck i_size under page lock */
	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
	if (unlikely(vmf->pgoff >= size)) {
		unlock_page(page);
		page_cache_release(page);
		return VM_FAULT_SIGBUS;
	}

	/*
	 * Found the page and have a reference on it.
	 */
	mark_page_accessed(page);
	ra->prev_pos = (loff_t)page->index << PAGE_CACHE_SHIFT;
	vmf->page = page;
	return ret | VM_FAULT_LOCKED;

no_cached_page:
	/*
	 * We're only likely to ever get here if MADV_RANDOM is in
	 * effect.
	 */
	error = page_cache_read(file, vmf->pgoff);

	/*
	 * The page we want has now been added to the page cache.
	 * In the unlikely event that someone removed it in the
	 * meantime, we'll just come back here and read it again.
	 */
	if (error >= 0)
		goto retry_find;

	/*
	 * An error return from page_cache_read can result if the
	 * system is low on memory, or a problem occurs while trying
	 * to schedule I/O.
	 */
	if (error == -ENOMEM)
		return VM_FAULT_OOM;
	return VM_FAULT_SIGBUS;

page_not_uptodate:
	/* IO error path */
	if (!did_readaround) {
		ret = VM_FAULT_MAJOR;
		count_vm_event(PGMAJFAULT);
	}

	/*
	 * Umm, take care of errors if the page isn't up-to-date.
	 * Try to re-read it _once_. We do this synchronously,
	 * because there really aren't any performance issues here
	 * and we need to check for errors.
	 */
	ClearPageError(page);
	error = mapping->a_ops->readpage(file, page);
	page_cache_release(page);

	if (!error || error == AOP_TRUNCATED_PAGE)
		goto retry_find;

	/* Things didn't work out. Return zero to tell the mm layer so. */
	shrink_readahead_size_eio(file, ra);
	return VM_FAULT_SIGBUS;
}
EXPORT_SYMBOL(filemap_fault);

struct vm_operations_struct generic_file_vm_ops = {
	.fault		= filemap_fault,
};

/* This is used for a general mmap of a disk file */

int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
{
	struct address_space *mapping = file->f_mapping;

	if (!mapping->a_ops->readpage)
		return -ENOEXEC;
	file_accessed(file);
	vma->vm_ops = &generic_file_vm_ops;
	vma->vm_flags |= VM_CAN_NONLINEAR;
	return 0;
}

/*
 * This is for filesystems which do not implement ->writepage.
 */
int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
{
	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
		return -EINVAL;
	return generic_file_mmap(file, vma);
}
#else
int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
{
	return -ENOSYS;
}
int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma)
{
	return -ENOSYS;
}
#endif /* CONFIG_MMU */

EXPORT_SYMBOL(generic_file_mmap);
EXPORT_SYMBOL(generic_file_readonly_mmap);

static struct page *__read_cache_page(struct address_space *mapping,
				pgoff_t index,
				int (*filler)(void *, struct page *),
				void *data)
{
	struct page *page;
	int err;
repeat:
	page = find_get_page(mapping, index);
	if (!page) {
		page = page_cache_alloc_cold(mapping);
		if (!page)
			return ERR_PTR(-ENOMEM);
		err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL);
		if (unlikely(err)) {
			page_cache_release(page);
			if (err == -EEXIST)
				goto repeat;
			/* Presumably ENOMEM for radix tree node */
			return ERR_PTR(err);
		}
		err = filler(data, page);
		if (err < 0) {
			page_cache_release(page);
			page = ERR_PTR(err);
		}
	}
	return page;
}

/**
 * read_cache_page_async - read into page cache, fill it if needed
 * @mapping: the page's address_space
 * @index: the page index
 * @filler: function to perform the read
 * @data: destination for read data
 *
 * Same as read_cache_page, but don't wait for page to become unlocked
 * after submitting it to the filler.
 *
 * Read into the page cache. If a page already exists, and PageUptodate() is
 * not set, try to fill the page but don't wait for it to become unlocked.
 *
 * If the page does not get brought uptodate, return -EIO.
 */
struct page *read_cache_page_async(struct address_space *mapping,
				pgoff_t index,
				int (*filler)(void *, struct page *),
				void *data)
{
	struct page *page;
	int err;

retry:
	page = __read_cache_page(mapping, index, filler, data);
	if (IS_ERR(page))
		return page;
	if (PageUptodate(page))
		goto out;

	lock_page(page);
	if (!page->mapping) {
		unlock_page(page);
		page_cache_release(page);
		goto retry;
	}
	if (PageUptodate(page)) {
		unlock_page(page);
		goto out;
	}
	err = filler(data, page);
	if (err < 0) {
		page_cache_release(page);
		return ERR_PTR(err);
	}
out:
	mark_page_accessed(page);
	return page;
}
EXPORT_SYMBOL(read_cache_page_async);

/**
 * read_cache_page - read into page cache, fill it if needed
 * @mapping: the page's address_space
 * @index: the page index
 * @filler: function to perform the read
 * @data: destination for read data
 *
 * Read into the page cache. If a page already exists, and PageUptodate() is
 * not set, try to fill the page then wait for it to become unlocked.
 *
 * If the page does not get brought uptodate, return -EIO.
 */
struct page *read_cache_page(struct address_space *mapping,
				pgoff_t index,
				int (*filler)(void *, struct page *),
				void *data)
{
	struct page *page;

	page = read_cache_page_async(mapping, index, filler, data);
	if (IS_ERR(page))
		goto out;
	wait_on_page_locked(page);
	if (!PageUptodate(page)) {
		page_cache_release(page);
		page = ERR_PTR(-EIO);
	}
 out:
	return page;
}
EXPORT_SYMBOL(read_cache_page);

/*
 * The logic we want is
 *
 *	if suid or (sgid and xgrp)
 *		remove privs
 */
int should_remove_suid(struct dentry *dentry)
{
	mode_t mode = dentry->d_inode->i_mode;
	int kill = 0;

	/* suid always must be killed */
	if (unlikely(mode & S_ISUID))
		kill = ATTR_KILL_SUID;

	/*
	 * sgid without any exec bits is just a mandatory locking mark; leave
	 * it alone. If some exec bits are set, it's a real sgid; kill it.
	 */
	if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
		kill |= ATTR_KILL_SGID;

	if (unlikely(kill && !capable(CAP_FSETID)))
		return kill;

	return 0;
}
EXPORT_SYMBOL(should_remove_suid);

int __remove_suid(struct dentry *dentry, int kill)
{
	struct iattr newattrs;

	newattrs.ia_valid = ATTR_FORCE | kill;
	return notify_change(dentry, &newattrs);
}

int remove_suid(struct dentry *dentry)
{
	int killsuid = should_remove_suid(dentry);
	int killpriv = security_inode_need_killpriv(dentry);
	int error = 0;

	if (killpriv < 0)
		return killpriv;
	if (killpriv)
		error = security_inode_killpriv(dentry);
	if (!error && killsuid)
		error = __remove_suid(dentry, killsuid);

	return error;
}
EXPORT_SYMBOL(remove_suid);

static size_t __iovec_copy_from_user_inatomic(char *vaddr,
			const struct iovec *iov, size_t base, size_t bytes)
{
	size_t copied = 0, left = 0;

	while (bytes) {
		char __user *buf = iov->iov_base + base;
		int copy = min(bytes, iov->iov_len - base);

		base = 0;
		left = __copy_from_user_inatomic_nocache(vaddr, buf, copy);
		copied += copy;
		bytes -= copy;
		vaddr += copy;
		iov++;

		if (unlikely(left))
			break;
	}
	return copied - left;
}

/*
 * Copy as much as we can into the page and return the number of bytes which
 * were successfully copied. If a fault is encountered then return the number of
 * bytes which were copied.
 */
size_t iov_iter_copy_from_user_atomic(struct page *page,
		struct iov_iter *i, unsigned long offset, size_t bytes)
{
	char *kaddr;
	size_t copied;

	BUG_ON(!in_atomic());
	kaddr = kmap_atomic(page, KM_USER0);
	if (likely(i->nr_segs == 1)) {
		int left;
		char __user *buf = i->iov->iov_base + i->iov_offset;
		left = __copy_from_user_inatomic_nocache(kaddr + offset,
							buf, bytes);
		copied = bytes - left;
	} else {
		copied = __iovec_copy_from_user_inatomic(kaddr + offset,
						i->iov, i->iov_offset, bytes);
	}
	kunmap_atomic(kaddr, KM_USER0);

	return copied;
}
EXPORT_SYMBOL(iov_iter_copy_from_user_atomic);

/*
 * This has the same sideeffects and return value as
 * iov_iter_copy_from_user_atomic().
 * The difference is that it attempts to resolve faults.
 * Page must not be locked.
 */
size_t iov_iter_copy_from_user(struct page *page,
		struct iov_iter *i, unsigned long offset, size_t bytes)
{
	char *kaddr;
	size_t copied;

	kaddr = kmap(page);
	if (likely(i->nr_segs == 1)) {
		int left;
		char __user *buf = i->iov->iov_base + i->iov_offset;
		left = __copy_from_user_nocache(kaddr + offset, buf, bytes);
		copied = bytes - left;
	} else {
		copied = __iovec_copy_from_user_inatomic(kaddr + offset,
						i->iov, i->iov_offset, bytes);
	}
	kunmap(page);
	return copied;
}
EXPORT_SYMBOL(iov_iter_copy_from_user);

void iov_iter_advance(struct iov_iter *i, size_t bytes)
{
	BUG_ON(i->count < bytes);

	if (likely(i->nr_segs == 1)) {
		i->iov_offset += bytes;
		i->count -= bytes;
	} else {
		const struct iovec *iov = i->iov;
		size_t base = i->iov_offset;

		/*
		 * The !iov->iov_len check ensures we skip over unlikely
		 * zero-length segments (without overrunning the iovec).
		 */
		while (bytes || unlikely(!iov->iov_len && i->count)) {
			int copy;

			copy = min(bytes, iov->iov_len - base);
			BUG_ON(!i->count || i->count < copy);
			i->count -= copy;
			bytes -= copy;
			base += copy;
			if (iov->iov_len == base) {
				iov++;
				base = 0;
			}
		}
		i->iov = iov;
		i->iov_offset = base;
	}
}
EXPORT_SYMBOL(iov_iter_advance);

/*
 * Fault in the first iovec of the given iov_iter, to a maximum length
 * of bytes. Returns 0 on success, or non-zero if the memory could not be
 * accessed (ie. because it is an invalid address).
 *
 * writev-intensive code may want this to prefault several iovecs -- that
 * would be possible (callers must not rely on the fact that _only_ the
 * first iovec will be faulted with the current implementation).
 */
int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes)
{
	char __user *buf = i->iov->iov_base + i->iov_offset;
	bytes = min(bytes, i->iov->iov_len - i->iov_offset);
	return fault_in_pages_readable(buf, bytes);
}
EXPORT_SYMBOL(iov_iter_fault_in_readable);

/*
 * Return the count of just the current iov_iter segment.
 */
size_t iov_iter_single_seg_count(struct iov_iter *i)
{
	const struct iovec *iov = i->iov;
	if (i->nr_segs == 1)
		return i->count;
	else
		return min(i->count, iov->iov_len - i->iov_offset);
}
EXPORT_SYMBOL(iov_iter_single_seg_count);

/*
 * Performs necessary checks before doing a write
 *
 * Can adjust writing position or amount of bytes to write.
 * Returns appropriate error code that caller should return or
 * zero in case that write should be allowed.
 */
inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk)
{
	struct inode *inode = file->f_mapping->host;
	unsigned long limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;

	if (unlikely(*pos < 0))
		return -EINVAL;

	if (!isblk) {
		/* FIXME: this is for backwards compatibility with 2.4 */
		if (file->f_flags & O_APPEND)
			*pos = i_size_read(inode);

		if (limit != RLIM_INFINITY) {
			if (*pos >= limit) {
				send_sig(SIGXFSZ, current, 0);
				return -EFBIG;
			}
			if (*count > limit - (typeof(limit))*pos) {
				*count = limit - (typeof(limit))*pos;
			}
		}
	}

	/*
	 * LFS rule
	 */
	if (unlikely(*pos + *count > MAX_NON_LFS &&
				!(file->f_flags & O_LARGEFILE))) {
		if (*pos >= MAX_NON_LFS) {
			return -EFBIG;
		}
		if (*count > MAX_NON_LFS - (unsigned long)*pos) {
			*count = MAX_NON_LFS - (unsigned long)*pos;
		}
	}

	/*
	 * Are we about to exceed the fs block limit ?
	 *
	 * If we have written data it becomes a short write.  If we have
	 * exceeded without writing data we send a signal and return EFBIG.
	 * Linus frestrict idea will clean these up nicely..
	 */
	if (likely(!isblk)) {
		if (unlikely(*pos >= inode->i_sb->s_maxbytes)) {
			if (*count || *pos > inode->i_sb->s_maxbytes) {
				return -EFBIG;
			}
			/* zero-length writes at ->s_maxbytes are OK */
		}

		if (unlikely(*pos + *count > inode->i_sb->s_maxbytes))
			*count = inode->i_sb->s_maxbytes - *pos;
	} else {
#ifdef CONFIG_BLOCK
		loff_t isize;
		if (bdev_read_only(I_BDEV(inode)))
			return -EPERM;
		isize = i_size_read(inode);
		if (*pos >= isize) {
			if (*count || *pos > isize)
				return -ENOSPC;
		}

		if (*pos + *count > isize)
			*count = isize - *pos;
#else
		return -EPERM;
#endif
	}
	return 0;
}
EXPORT_SYMBOL(generic_write_checks);

int pagecache_write_begin(struct file *file, struct address_space *mapping,
				loff_t pos, unsigned len, unsigned flags,
				struct page **pagep, void **fsdata)
{
	const struct address_space_operations *aops = mapping->a_ops;

	if (aops->write_begin) {
		return aops->write_begin(file, mapping, pos, len, flags,
							pagep, fsdata);
	} else {
		int ret;
		pgoff_t index = pos >> PAGE_CACHE_SHIFT;
		unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
		struct inode *inode = mapping->host;
		struct page *page;
again:
		page = __grab_cache_page(mapping, index);
		*pagep = page;
		if (!page)
			return -ENOMEM;

		if (flags & AOP_FLAG_UNINTERRUPTIBLE && !PageUptodate(page)) {
			/*
			 * There is no way to resolve a short write situation
			 * for a !Uptodate page (except by double copying in
			 * the caller done by generic_perform_write_2copy).
			 *
			 * Instead, we have to bring it uptodate here.
1935 */ 1936 ret = aops->readpage(file, page); 1937 page_cache_release(page); 1938 if (ret) { 1939 if (ret == AOP_TRUNCATED_PAGE) 1940 goto again; 1941 return ret; 1942 } 1943 goto again; 1944 } 1945 1946 ret = aops->prepare_write(file, page, offset, offset+len); 1947 if (ret) { 1948 unlock_page(page); 1949 page_cache_release(page); 1950 if (pos + len > inode->i_size) 1951 vmtruncate(inode, inode->i_size); 1952 } 1953 return ret; 1954 } 1955} 1956EXPORT_SYMBOL(pagecache_write_begin); 1957 1958int pagecache_write_end(struct file *file, struct address_space *mapping, 1959 loff_t pos, unsigned len, unsigned copied, 1960 struct page *page, void *fsdata) 1961{ 1962 const struct address_space_operations *aops = mapping->a_ops; 1963 int ret; 1964 1965 if (aops->write_end) { 1966 mark_page_accessed(page); 1967 ret = aops->write_end(file, mapping, pos, len, copied, 1968 page, fsdata); 1969 } else { 1970 unsigned offset = pos & (PAGE_CACHE_SIZE - 1); 1971 struct inode *inode = mapping->host; 1972 1973 flush_dcache_page(page); 1974 ret = aops->commit_write(file, page, offset, offset+len); 1975 unlock_page(page); 1976 mark_page_accessed(page); 1977 page_cache_release(page); 1978 1979 if (ret < 0) { 1980 if (pos + len > inode->i_size) 1981 vmtruncate(inode, inode->i_size); 1982 } else if (ret > 0) 1983 ret = min_t(size_t, copied, ret); 1984 else 1985 ret = copied; 1986 } 1987 1988 return ret; 1989} 1990EXPORT_SYMBOL(pagecache_write_end); 1991 1992ssize_t 1993generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, 1994 unsigned long *nr_segs, loff_t pos, loff_t *ppos, 1995 size_t count, size_t ocount) 1996{ 1997 struct file *file = iocb->ki_filp; 1998 struct address_space *mapping = file->f_mapping; 1999 struct inode *inode = mapping->host; 2000 ssize_t written; 2001 2002 if (count != ocount) 2003 *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count); 2004 2005 written = generic_file_direct_IO(WRITE, iocb, iov, pos, *nr_segs); 2006 if (written > 0) { 2007 loff_t end = pos + written; 2008 if (end > i_size_read(inode) && !S_ISBLK(inode->i_mode)) { 2009 i_size_write(inode, end); 2010 mark_inode_dirty(inode); 2011 } 2012 *ppos = end; 2013 } 2014 2015 /* 2016 * Sync the fs metadata but not the minor inode changes and 2017 * of course not the data as we did direct DMA for the IO. 2018 * i_mutex is held, which protects generic_osync_inode() from 2019 * livelocking. AIO O_DIRECT ops attempt to sync metadata here. 2020 */ 2021 if ((written >= 0 || written == -EIOCBQUEUED) && 2022 ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { 2023 int err = generic_osync_inode(inode, mapping, OSYNC_METADATA); 2024 if (err < 0) 2025 written = err; 2026 } 2027 return written; 2028} 2029EXPORT_SYMBOL(generic_file_direct_write); 2030 2031/* 2032 * Find or create a page at the given pagecache position. Return the locked 2033 * page. This function is specifically for buffered writes. 
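
The buffered write loops further down (generic_perform_write_2copy and generic_perform_write) split a request into per-page chunks: the offset within the page is pos & (PAGE_CACHE_SIZE - 1), and at most PAGE_CACHE_SIZE - offset bytes are handled per iteration. That chunking arithmetic is sketched below in plain userspace C, with 4096 as a stand-in page size; it is an illustration of the loop shape, not kernel code.

// Userspace sketch of the per-page chunking used by the buffered write
// loops below: each iteration covers at most one pagecache page, starting
// at pos modulo the page size. 4096 is a stand-in page size.
#include <stdint.h>
#include <stdio.h>

#define MODEL_PAGE_SIZE 4096ULL

int main(void)
{
    uint64_t pos = 4090;        // the write starts 6 bytes before a page boundary
    uint64_t remaining = 10000; // total bytes requested

    while (remaining) {
        uint64_t index = pos / MODEL_PAGE_SIZE;          // pagecache index
        uint64_t offset = pos & (MODEL_PAGE_SIZE - 1);   // offset into the page
        uint64_t bytes = MODEL_PAGE_SIZE - offset;       // room left in the page

        if (bytes > remaining)
            bytes = remaining;

        // In the kernel, write_begin / copy / write_end happen here.
        printf("page %llu: %llu bytes at offset %llu\n",
               (unsigned long long)index, (unsigned long long)bytes,
               (unsigned long long)offset);

        pos += bytes;
        remaining -= bytes;
    }
    return 0;
}
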
2034 */ 2035struct page *__grab_cache_page(struct address_space *mapping, pgoff_t index) 2036{ 2037 int status; 2038 struct page *page; 2039repeat: 2040 page = find_lock_page(mapping, index); 2041 if (likely(page)) 2042 return page; 2043 2044 page = page_cache_alloc(mapping); 2045 if (!page) 2046 return NULL; 2047 status = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL); 2048 if (unlikely(status)) { 2049 page_cache_release(page); 2050 if (status == -EEXIST) 2051 goto repeat; 2052 return NULL; 2053 } 2054 return page; 2055} 2056EXPORT_SYMBOL(__grab_cache_page); 2057 2058static ssize_t generic_perform_write_2copy(struct file *file, 2059 struct iov_iter *i, loff_t pos) 2060{ 2061 struct address_space *mapping = file->f_mapping; 2062 const struct address_space_operations *a_ops = mapping->a_ops; 2063 struct inode *inode = mapping->host; 2064 long status = 0; 2065 ssize_t written = 0; 2066 2067 do { 2068 struct page *src_page; 2069 struct page *page; 2070 pgoff_t index; /* Pagecache index for current page */ 2071 unsigned long offset; /* Offset into pagecache page */ 2072 unsigned long bytes; /* Bytes to write to page */ 2073 size_t copied; /* Bytes copied from user */ 2074 2075 offset = (pos & (PAGE_CACHE_SIZE - 1)); 2076 index = pos >> PAGE_CACHE_SHIFT; 2077 bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, 2078 iov_iter_count(i)); 2079 2080 /* 2081 * a non-NULL src_page indicates that we're doing the 2082 * copy via get_user_pages and kmap. 2083 */ 2084 src_page = NULL; 2085 2086 /* 2087 * Bring in the user page that we will copy from _first_. 2088 * Otherwise there's a nasty deadlock on copying from the 2089 * same page as we're writing to, without it being marked 2090 * up-to-date. 2091 * 2092 * Not only is this an optimisation, but it is also required 2093 * to check that the address is actually valid, when atomic 2094 * usercopies are used, below. 2095 */ 2096 if (unlikely(iov_iter_fault_in_readable(i, bytes))) { 2097 status = -EFAULT; 2098 break; 2099 } 2100 2101 page = __grab_cache_page(mapping, index); 2102 if (!page) { 2103 status = -ENOMEM; 2104 break; 2105 } 2106 2107 /* 2108 * non-uptodate pages cannot cope with short copies, and we 2109 * cannot take a pagefault with the destination page locked. 2110 * So pin the source page to copy it. 2111 */ 2112 if (!PageUptodate(page) && !segment_eq(get_fs(), KERNEL_DS)) { 2113 unlock_page(page); 2114 2115 src_page = alloc_page(GFP_KERNEL); 2116 if (!src_page) { 2117 page_cache_release(page); 2118 status = -ENOMEM; 2119 break; 2120 } 2121 2122 /* 2123 * Cannot get_user_pages with a page locked for the 2124 * same reason as we can't take a page fault with a 2125 * page locked (as explained below). 2126 */ 2127 copied = iov_iter_copy_from_user(src_page, i, 2128 offset, bytes); 2129 if (unlikely(copied == 0)) { 2130 status = -EFAULT; 2131 page_cache_release(page); 2132 page_cache_release(src_page); 2133 break; 2134 } 2135 bytes = copied; 2136 2137 lock_page(page); 2138 /* 2139 * Can't handle the page going uptodate here, because 2140 * that means we would use non-atomic usercopies, which 2141 * zero out the tail of the page, which can cause 2142 * zeroes to become transiently visible. We could just 2143 * use a non-zeroing copy, but the APIs aren't too 2144 * consistent. 
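
__grab_cache_page() above follows a common find-or-create pattern: look the page up, allocate outside the tree lock on a miss, then insert and retry the lookup if -EEXIST shows that another task installed a page first. The userspace sketch below mirrors only that shape, with a mutex-protected table standing in for the page cache; grab_slot, slots and NSLOTS are invented names for illustration and carry none of the locking semantics of the real page lock.

// Userspace sketch of the find-or-create pattern in __grab_cache_page():
// look up under a lock, allocate outside the lock on a miss, then re-check
// before inserting and retry if another thread installed an object first.
// grab_slot, slots and NSLOTS are invented names; the "cache" is a toy.
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#define NSLOTS 16

static void *slots[NSLOTS];    // index -> cached object, NULL if absent
static pthread_mutex_t cache_lock = PTHREAD_MUTEX_INITIALIZER;

static void *grab_slot(unsigned int index)    // index must be < NSLOTS
{
    void *obj, *mine;

repeat:
    pthread_mutex_lock(&cache_lock);          // find_lock_page() analogue
    obj = slots[index];
    pthread_mutex_unlock(&cache_lock);
    if (obj)
        return obj;

    mine = malloc(64);                        // page_cache_alloc() analogue
    if (!mine)
        return NULL;

    pthread_mutex_lock(&cache_lock);
    if (slots[index]) {
        // Lost the race, like add_to_page_cache_lru() returning -EEXIST:
        // drop our allocation and go back to the lookup.
        pthread_mutex_unlock(&cache_lock);
        free(mine);
        goto repeat;
    }
    slots[index] = mine;                      // add_to_page_cache_lru() analogue
    pthread_mutex_unlock(&cache_lock);
    return mine;
}

int main(void)
{
    void *a = grab_slot(3);
    void *b = grab_slot(3);

    printf("second lookup found the same object: %s\n",
           a == b ? "yes" : "no");
    return 0;
}
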
2145 */ 2146 if (unlikely(!page->mapping || PageUptodate(page))) { 2147 unlock_page(page); 2148 page_cache_release(page); 2149 page_cache_release(src_page); 2150 continue; 2151 } 2152 } 2153 2154 status = a_ops->prepare_write(file, page, offset, offset+bytes); 2155 if (unlikely(status)) 2156 goto fs_write_aop_error; 2157 2158 if (!src_page) { 2159 /* 2160 * Must not enter the pagefault handler here, because 2161 * we hold the page lock, so we might recursively 2162 * deadlock on the same lock, or get an ABBA deadlock 2163 * against a different lock, or against the mmap_sem 2164 * (which nests outside the page lock). So increment 2165 * preempt count, and use _atomic usercopies. 2166 * 2167 * The page is uptodate so we are OK to encounter a 2168 * short copy: if unmodified parts of the page are 2169 * marked dirty and written out to disk, it doesn't 2170 * really matter. 2171 */ 2172 pagefault_disable(); 2173 copied = iov_iter_copy_from_user_atomic(page, i, 2174 offset, bytes); 2175 pagefault_enable(); 2176 } else { 2177 void *src, *dst; 2178 src = kmap_atomic(src_page, KM_USER0); 2179 dst = kmap_atomic(page, KM_USER1); 2180 memcpy(dst + offset, src + offset, bytes); 2181 kunmap_atomic(dst, KM_USER1); 2182 kunmap_atomic(src, KM_USER0); 2183 copied = bytes; 2184 } 2185 flush_dcache_page(page); 2186 2187 status = a_ops->commit_write(file, page, offset, offset+bytes); 2188 if (unlikely(status < 0)) 2189 goto fs_write_aop_error; 2190 if (unlikely(status > 0)) /* filesystem did partial write */ 2191 copied = min_t(size_t, copied, status); 2192 2193 unlock_page(page); 2194 mark_page_accessed(page); 2195 page_cache_release(page); 2196 if (src_page) 2197 page_cache_release(src_page); 2198 2199 iov_iter_advance(i, copied); 2200 pos += copied; 2201 written += copied; 2202 2203 balance_dirty_pages_ratelimited(mapping); 2204 cond_resched(); 2205 continue; 2206 2207fs_write_aop_error: 2208 unlock_page(page); 2209 page_cache_release(page); 2210 if (src_page) 2211 page_cache_release(src_page); 2212 2213 /* 2214 * prepare_write() may have instantiated a few blocks 2215 * outside i_size. Trim these off again. Don't need 2216 * i_size_read because we hold i_mutex. 2217 */ 2218 if (pos + bytes > inode->i_size) 2219 vmtruncate(inode, inode->i_size); 2220 break; 2221 } while (iov_iter_count(i)); 2222 2223 return written ? written : status; 2224} 2225 2226static ssize_t generic_perform_write(struct file *file, 2227 struct iov_iter *i, loff_t pos) 2228{ 2229 struct address_space *mapping = file->f_mapping; 2230 const struct address_space_operations *a_ops = mapping->a_ops; 2231 long status = 0; 2232 ssize_t written = 0; 2233 unsigned int flags = 0; 2234 2235 /* 2236 * Copies from kernel address space cannot fail (NFSD is a big user). 2237 */ 2238 if (segment_eq(get_fs(), KERNEL_DS)) 2239 flags |= AOP_FLAG_UNINTERRUPTIBLE; 2240 2241 do { 2242 struct page *page; 2243 pgoff_t index; /* Pagecache index for current page */ 2244 unsigned long offset; /* Offset into pagecache page */ 2245 unsigned long bytes; /* Bytes to write to page */ 2246 size_t copied; /* Bytes copied from user */ 2247 void *fsdata; 2248 2249 offset = (pos & (PAGE_CACHE_SIZE - 1)); 2250 index = pos >> PAGE_CACHE_SHIFT; 2251 bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, 2252 iov_iter_count(i)); 2253 2254again: 2255 2256 /* 2257 * Bring in the user page that we will copy from _first_. 2258 * Otherwise there's a nasty deadlock on copying from the 2259 * same page as we're writing to, without it being marked 2260 * up-to-date. 
2261 * 2262 * Not only is this an optimisation, but it is also required 2263 * to check that the address is actually valid, when atomic 2264 * usercopies are used, below. 2265 */ 2266 if (unlikely(iov_iter_fault_in_readable(i, bytes))) { 2267 status = -EFAULT; 2268 break; 2269 } 2270 2271 status = a_ops->write_begin(file, mapping, pos, bytes, flags, 2272 &page, &fsdata); 2273 if (unlikely(status)) 2274 break; 2275 2276 pagefault_disable(); 2277 copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); 2278 pagefault_enable(); 2279 flush_dcache_page(page); 2280 2281 status = a_ops->write_end(file, mapping, pos, bytes, copied, 2282 page, fsdata); 2283 if (unlikely(status < 0)) 2284 break; 2285 copied = status; 2286 2287 cond_resched(); 2288 2289 iov_iter_advance(i, copied); 2290 if (unlikely(copied == 0)) { 2291 /* 2292 * If we were unable to copy any data at all, we must 2293 * fall back to a single segment length write. 2294 * 2295 * If we didn't fallback here, we could livelock 2296 * because not all segments in the iov can be copied at 2297 * once without a pagefault. 2298 */ 2299 bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, 2300 iov_iter_single_seg_count(i)); 2301 goto again; 2302 } 2303 pos += copied; 2304 written += copied; 2305 2306 balance_dirty_pages_ratelimited(mapping); 2307 2308 } while (iov_iter_count(i)); 2309 2310 return written ? written : status; 2311} 2312 2313ssize_t 2314generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, 2315 unsigned long nr_segs, loff_t pos, loff_t *ppos, 2316 size_t count, ssize_t written) 2317{ 2318 struct file *file = iocb->ki_filp; 2319 struct address_space *mapping = file->f_mapping; 2320 const struct address_space_operations *a_ops = mapping->a_ops; 2321 struct inode *inode = mapping->host; 2322 ssize_t status; 2323 struct iov_iter i; 2324 2325 iov_iter_init(&i, iov, nr_segs, count, written); 2326 if (a_ops->write_begin) 2327 status = generic_perform_write(file, &i, pos); 2328 else 2329 status = generic_perform_write_2copy(file, &i, pos); 2330 2331 if (likely(status >= 0)) { 2332 written += status; 2333 *ppos = pos + status; 2334 2335 /* 2336 * For now, when the user asks for O_SYNC, we'll actually give 2337 * O_DSYNC 2338 */ 2339 if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) { 2340 if (!a_ops->writepage || !is_sync_kiocb(iocb)) 2341 status = generic_osync_inode(inode, mapping, 2342 OSYNC_METADATA|OSYNC_DATA); 2343 } 2344 } 2345 2346 /* 2347 * If we get here for O_DIRECT writes then we must have fallen through 2348 * to buffered writes (block instantiation inside i_size). So we sync 2349 * the file data here, to try to honour O_DIRECT expectations. 2350 */ 2351 if (unlikely(file->f_flags & O_DIRECT) && written) 2352 status = filemap_write_and_wait(mapping); 2353 2354 return written ? 
written : status; 2355} 2356EXPORT_SYMBOL(generic_file_buffered_write); 2357 2358static ssize_t 2359__generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, 2360 unsigned long nr_segs, loff_t *ppos) 2361{ 2362 struct file *file = iocb->ki_filp; 2363 struct address_space * mapping = file->f_mapping; 2364 size_t ocount; /* original count */ 2365 size_t count; /* after file limit checks */ 2366 struct inode *inode = mapping->host; 2367 loff_t pos; 2368 ssize_t written; 2369 ssize_t err; 2370 2371 ocount = 0; 2372 err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ); 2373 if (err) 2374 return err; 2375 2376 count = ocount; 2377 pos = *ppos; 2378 2379 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); 2380 2381 /* We can write back this queue in page reclaim */ 2382 current->backing_dev_info = mapping->backing_dev_info; 2383 written = 0; 2384 2385 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); 2386 if (err) 2387 goto out; 2388 2389 if (count == 0) 2390 goto out; 2391 2392 err = remove_suid(file->f_path.dentry); 2393 if (err) 2394 goto out; 2395 2396 file_update_time(file); 2397 2398 /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ 2399 if (unlikely(file->f_flags & O_DIRECT)) { 2400 loff_t endbyte; 2401 ssize_t written_buffered; 2402 2403 written = generic_file_direct_write(iocb, iov, &nr_segs, pos, 2404 ppos, count, ocount); 2405 if (written < 0 || written == count) 2406 goto out; 2407 /* 2408 * direct-io write to a hole: fall through to buffered I/O 2409 * for completing the rest of the request. 2410 */ 2411 pos += written; 2412 count -= written; 2413 written_buffered = generic_file_buffered_write(iocb, iov, 2414 nr_segs, pos, ppos, count, 2415 written); 2416 /* 2417 * If generic_file_buffered_write() retuned a synchronous error 2418 * then we want to return the number of bytes which were 2419 * direct-written, or the error code if that was zero. Note 2420 * that this differs from normal direct-io semantics, which 2421 * will return -EFOO even if some bytes were written. 2422 */ 2423 if (written_buffered < 0) { 2424 err = written_buffered; 2425 goto out; 2426 } 2427 2428 /* 2429 * We need to ensure that the page cache pages are written to 2430 * disk and invalidated to preserve the expected O_DIRECT 2431 * semantics. 2432 */ 2433 endbyte = pos + written_buffered - written - 1; 2434 err = do_sync_mapping_range(file->f_mapping, pos, endbyte, 2435 SYNC_FILE_RANGE_WAIT_BEFORE| 2436 SYNC_FILE_RANGE_WRITE| 2437 SYNC_FILE_RANGE_WAIT_AFTER); 2438 if (err == 0) { 2439 written = written_buffered; 2440 invalidate_mapping_pages(mapping, 2441 pos >> PAGE_CACHE_SHIFT, 2442 endbyte >> PAGE_CACHE_SHIFT); 2443 } else { 2444 /* 2445 * We don't know how much we wrote, so just return 2446 * the number of bytes which were direct-written 2447 */ 2448 } 2449 } else { 2450 written = generic_file_buffered_write(iocb, iov, nr_segs, 2451 pos, ppos, count, written); 2452 } 2453out: 2454 current->backing_dev_info = NULL; 2455 return written ? 
written : err; 2456} 2457 2458ssize_t generic_file_aio_write_nolock(struct kiocb *iocb, 2459 const struct iovec *iov, unsigned long nr_segs, loff_t pos) 2460{ 2461 struct file *file = iocb->ki_filp; 2462 struct address_space *mapping = file->f_mapping; 2463 struct inode *inode = mapping->host; 2464 ssize_t ret; 2465 2466 BUG_ON(iocb->ki_pos != pos); 2467 2468 ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs, 2469 &iocb->ki_pos); 2470 2471 if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { 2472 ssize_t err; 2473 2474 err = sync_page_range_nolock(inode, mapping, pos, ret); 2475 if (err < 0) 2476 ret = err; 2477 } 2478 return ret; 2479} 2480EXPORT_SYMBOL(generic_file_aio_write_nolock); 2481 2482ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, 2483 unsigned long nr_segs, loff_t pos) 2484{ 2485 struct file *file = iocb->ki_filp; 2486 struct address_space *mapping = file->f_mapping; 2487 struct inode *inode = mapping->host; 2488 ssize_t ret; 2489 2490 BUG_ON(iocb->ki_pos != pos); 2491 2492 mutex_lock(&inode->i_mutex); 2493 ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs, 2494 &iocb->ki_pos); 2495 mutex_unlock(&inode->i_mutex); 2496 2497 if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { 2498 ssize_t err; 2499 2500 err = sync_page_range(inode, mapping, pos, ret); 2501 if (err < 0) 2502 ret = err; 2503 } 2504 return ret; 2505} 2506EXPORT_SYMBOL(generic_file_aio_write); 2507 2508/* 2509 * Called under i_mutex for writes to S_ISREG files. Returns -EIO if something 2510 * went wrong during pagecache shootdown. 2511 */ 2512static ssize_t 2513generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, 2514 loff_t offset, unsigned long nr_segs) 2515{ 2516 struct file *file = iocb->ki_filp; 2517 struct address_space *mapping = file->f_mapping; 2518 ssize_t retval; 2519 size_t write_len; 2520 pgoff_t end = 0; /* silence gcc */ 2521 2522 /* 2523 * If it's a write, unmap all mmappings of the file up-front. This 2524 * will cause any pte dirty bits to be propagated into the pageframes 2525 * for the subsequent filemap_write_and_wait(). 2526 */ 2527 if (rw == WRITE) { 2528 write_len = iov_length(iov, nr_segs); 2529 end = (offset + write_len - 1) >> PAGE_CACHE_SHIFT; 2530 if (mapping_mapped(mapping)) 2531 unmap_mapping_range(mapping, offset, write_len, 0); 2532 } 2533 2534 retval = filemap_write_and_wait(mapping); 2535 if (retval) 2536 goto out; 2537 2538 /* 2539 * After a write we want buffered reads to be sure to go to disk to get 2540 * the new data. We invalidate clean cached page from the region we're 2541 * about to write. We do this *before* the write so that we can return 2542 * -EIO without clobbering -EIOCBQUEUED from ->direct_IO(). 2543 */ 2544 if (rw == WRITE && mapping->nrpages) { 2545 retval = invalidate_inode_pages2_range(mapping, 2546 offset >> PAGE_CACHE_SHIFT, end); 2547 if (retval) 2548 goto out; 2549 } 2550 2551 retval = mapping->a_ops->direct_IO(rw, iocb, iov, offset, nr_segs); 2552 2553 /* 2554 * Finally, try again to invalidate clean pages which might have been 2555 * cached by non-direct readahead, or faulted in by get_user_pages() 2556 * if the source of the write was an mmap'ed region of the file 2557 * we're writing. Either one is a pretty crazy thing to do, 2558 * so we don't support it 100%. If this invalidation 2559 * fails, tough, the write still worked... 
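
When an O_DIRECT write above falls back to buffered I/O, __generic_file_aio_write_nolock() flushes and waits on just the affected byte range (do_sync_mapping_range with SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE | SYNC_FILE_RANGE_WAIT_AFTER) before invalidating those pages. Applications can request the same kind of ranged writeback through the Linux-specific sync_file_range(2) system call, which uses the same flag names; the sketch below is a userspace illustration of that, and like the kernel helper it pushes data pages only and commits no metadata.

// Userspace sketch: the WAIT_BEFORE | WRITE | WAIT_AFTER flush that the
// O_DIRECT fallback above performs with do_sync_mapping_range() is exposed
// to applications as the Linux-specific sync_file_range(2). It pushes data
// pages only; it does not commit any metadata.
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
    const char msg[] = "hello, pagecache\n";
    int fd = open("/tmp/sfr-demo", O_RDWR | O_CREAT | O_TRUNC, 0644);

    if (fd < 0) {
        perror("open");
        return 1;
    }
    if (pwrite(fd, msg, sizeof(msg) - 1, 0) != (ssize_t)(sizeof(msg) - 1)) {
        perror("pwrite");
        return 1;
    }

    // Start writeback of the dirty range and wait for it to finish,
    // mirroring the flag combination used by the kernel code above.
    if (sync_file_range(fd, 0, sizeof(msg) - 1,
                        SYNC_FILE_RANGE_WAIT_BEFORE |
                        SYNC_FILE_RANGE_WRITE |
                        SYNC_FILE_RANGE_WAIT_AFTER) != 0)
        perror("sync_file_range");

    close(fd);
    return 0;
}

If the application also needs metadata, for example a new file size, to be durable, a following fsync() or fdatasync() is still required; sync_file_range() deliberately covers data pages alone.
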
2560 */ 2561 if (rw == WRITE && mapping->nrpages) { 2562 invalidate_inode_pages2_range(mapping, offset >> PAGE_CACHE_SHIFT, end); 2563 } 2564out: 2565 return retval; 2566} 2567 2568/** 2569 * try_to_release_page() - release old fs-specific metadata on a page 2570 * 2571 * @page: the page which the kernel is trying to free 2572 * @gfp_mask: memory allocation flags (and I/O mode) 2573 * 2574 * The address_space is to try to release any data against the page 2575 * (presumably at page->private). If the release was successful, return `1'. 2576 * Otherwise return zero. 2577 * 2578 * The @gfp_mask argument specifies whether I/O may be performed to release 2579 * this page (__GFP_IO), and whether the call may block (__GFP_WAIT). 2580 * 2581 * NOTE: @gfp_mask may go away, and this function may become non-blocking. 2582 */ 2583int try_to_release_page(struct page *page, gfp_t gfp_mask) 2584{ 2585 struct address_space * const mapping = page->mapping; 2586 2587 BUG_ON(!PageLocked(page)); 2588 if (PageWriteback(page)) 2589 return 0; 2590 2591 if (mapping && mapping->a_ops->releasepage) 2592 return mapping->a_ops->releasepage(page, gfp_mask); 2593 return try_to_free_buffers(page); 2594} 2595 2596EXPORT_SYMBOL(try_to_release_page); 2597
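
try_to_release_page() above is one of the hooks used when the kernel wants to strip filesystem-private data so a pagecache page can be invalidated or freed. From userspace, posix_fadvise(POSIX_FADV_DONTNEED) is one way to trigger that kind of invalidation for a file's cached pages; invalidating pages that still carry private data ends up in paths like try_to_release_page(). The sketch below is a minimal illustration, flushing first so the pages are clean and can actually be dropped; the file path is just an example.

// Userspace sketch: posix_fadvise(POSIX_FADV_DONTNEED) asks the kernel to
// drop a file's cached pages; invalidating pages that still carry
// filesystem-private data goes through paths like try_to_release_page()
// above. Flushing first makes the pages clean so they can be dropped.
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
    const char data[] = "cached data\n";
    int err;
    int fd = open("/tmp/fadvise-demo", O_RDWR | O_CREAT | O_TRUNC, 0644);

    if (fd < 0) {
        perror("open");
        return 1;
    }
    if (write(fd, data, sizeof(data) - 1) != (ssize_t)(sizeof(data) - 1))
        perror("write");

    // Write dirty pages back so the cached copies are clean...
    if (fdatasync(fd) != 0)
        perror("fdatasync");

    // ...then ask for the whole file's pagecache to be dropped.
    // posix_fadvise() returns an error number directly, not -1/errno.
    err = posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED);
    if (err)
        fprintf(stderr, "posix_fadvise: error %d\n", err);

    close(fd);
    return 0;
}

Dirty pages are skipped by this kind of invalidation, which is why the fdatasync() comes first in the sketch.
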