shmem.c revision 40e041a2c858b3caefc757e26cb85bfceae5062b
/*
 * Resizable virtual memory filesystem for Linux.
 *
 * Copyright (C) 2000 Linus Torvalds.
 *		 2000 Transmeta Corp.
 *		 2000-2001 Christoph Rohland
 *		 2000-2001 SAP AG
 *		 2002 Red Hat Inc.
 * Copyright (C) 2002-2011 Hugh Dickins.
 * Copyright (C) 2011 Google Inc.
 * Copyright (C) 2002-2005 VERITAS Software Corporation.
 * Copyright (C) 2004 Andi Kleen, SuSE Labs
 *
 * Extended attribute support for tmpfs:
 * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net>
 * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
 *
 * tiny-shmem:
 * Copyright (c) 2004, 2008 Matt Mackall <mpm@selenic.com>
 *
 * This file is released under the GPL.
 */

#include <linux/fs.h>
#include <linux/init.h>
#include <linux/vfs.h>
#include <linux/mount.h>
#include <linux/ramfs.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/export.h>
#include <linux/swap.h>
#include <linux/aio.h>

static struct vfsmount *shm_mnt;

#ifdef CONFIG_SHMEM
/*
 * This virtual memory filesystem is heavily based on the ramfs. It
 * extends ramfs by the ability to use swap and honor resource limits
 * which makes it a completely usable filesystem.
 */

#include <linux/xattr.h>
#include <linux/exportfs.h>
#include <linux/posix_acl.h>
#include <linux/posix_acl_xattr.h>
#include <linux/mman.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/backing-dev.h>
#include <linux/shmem_fs.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/pagevec.h>
#include <linux/percpu_counter.h>
#include <linux/falloc.h>
#include <linux/splice.h>
#include <linux/security.h>
#include <linux/swapops.h>
#include <linux/mempolicy.h>
#include <linux/namei.h>
#include <linux/ctype.h>
#include <linux/migrate.h>
#include <linux/highmem.h>
#include <linux/seq_file.h>
#include <linux/magic.h>
#include <linux/fcntl.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>

#define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512)
#define VM_ACCT(size)   (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT)
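/*
 * A quick worked example of the two macros above, assuming the common
 * PAGE_CACHE_SIZE of 4096 bytes: BLOCKS_PER_PAGE is 8, since i_blocks
 * counts traditional 512-byte blocks; and VM_ACCT() rounds a byte count
 * up to whole pages, so VM_ACCT(5000) charges 2 pages against the
 * overcommit accounting.
 */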
/* Pretend that each entry is of this size in directory's i_size */
#define BOGO_DIRENT_SIZE 20

/* Symlink up to this size is kmalloc'ed instead of using a swappable page */
#define SHORT_SYMLINK_LEN 128

/*
 * shmem_fallocate communicates with shmem_fault or shmem_writepage via
 * inode->i_private (with i_mutex making sure that it has only one user at
 * a time): we would prefer not to enlarge the shmem inode just for that.
 */
struct shmem_falloc {
	wait_queue_head_t *waitq; /* faults into hole wait for punch to end */
	pgoff_t start;		/* start of range currently being fallocated */
	pgoff_t next;		/* the next page offset to be fallocated */
	pgoff_t nr_falloced;	/* how many new pages have been fallocated */
	pgoff_t nr_unswapped;	/* how often writepage refused to swap out */
};

/* Flag allocation requirements to shmem_getpage */
enum sgp_type {
	SGP_READ,	/* don't exceed i_size, don't allocate page */
	SGP_CACHE,	/* don't exceed i_size, may allocate page */
	SGP_DIRTY,	/* like SGP_CACHE, but set new page dirty */
	SGP_WRITE,	/* may exceed i_size, may allocate !Uptodate page */
	SGP_FALLOC,	/* like SGP_WRITE, but make existing page Uptodate */
};

#ifdef CONFIG_TMPFS
static unsigned long shmem_default_max_blocks(void)
{
	return totalram_pages / 2;
}

static unsigned long shmem_default_max_inodes(void)
{
	return min(totalram_pages - totalhigh_pages, totalram_pages / 2);
}
#endif

static bool shmem_should_replace_page(struct page *page, gfp_t gfp);
static int shmem_replace_page(struct page **pagep, gfp_t gfp,
				struct shmem_inode_info *info, pgoff_t index);
static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
	struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type);

static inline int shmem_getpage(struct inode *inode, pgoff_t index,
	struct page **pagep, enum sgp_type sgp, int *fault_type)
{
	return shmem_getpage_gfp(inode, index, pagep, sgp,
			mapping_gfp_mask(inode->i_mapping), fault_type);
}

static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
{
	return sb->s_fs_info;
}

/*
 * shmem_file_setup pre-accounts the whole fixed size of a VM object,
 * for shared memory and for shared anonymous (/dev/zero) mappings
 * (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1),
 * consistent with the pre-accounting of private mappings ...
 */
static inline int shmem_acct_size(unsigned long flags, loff_t size)
{
	return (flags & VM_NORESERVE) ?
		0 : security_vm_enough_memory_mm(current->mm, VM_ACCT(size));
}

static inline void shmem_unacct_size(unsigned long flags, loff_t size)
{
	if (!(flags & VM_NORESERVE))
		vm_unacct_memory(VM_ACCT(size));
}

static inline int shmem_reacct_size(unsigned long flags,
		loff_t oldsize, loff_t newsize)
{
	if (!(flags & VM_NORESERVE)) {
		if (VM_ACCT(newsize) > VM_ACCT(oldsize))
			return security_vm_enough_memory_mm(current->mm,
					VM_ACCT(newsize) - VM_ACCT(oldsize));
		else if (VM_ACCT(newsize) < VM_ACCT(oldsize))
			vm_unacct_memory(VM_ACCT(oldsize) - VM_ACCT(newsize));
	}
	return 0;
}

/*
 * ... whereas tmpfs objects are accounted incrementally as
 * pages are allocated, in order to allow huge sparse files.
 * shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM,
 * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
 */
static inline int shmem_acct_block(unsigned long flags)
{
	return (flags & VM_NORESERVE) ?
		security_vm_enough_memory_mm(current->mm, VM_ACCT(PAGE_CACHE_SIZE)) : 0;
}
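/*
 * To illustrate the split described above: a SysV shm segment or shared
 * anonymous mapping charges its entire fixed size once, at setup, via
 * shmem_acct_size(); whereas a 1GB sparse tmpfs file that has only ever
 * had a few pages written charges just those pages, one at a time, via
 * shmem_acct_block() -- and a charge failure there surfaces as -ENOSPC,
 * so a fault on the sparse mapping gets SIGBUS rather than triggering OOM.
 */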
static inline void shmem_unacct_blocks(unsigned long flags, long pages)
{
	if (flags & VM_NORESERVE)
		vm_unacct_memory(pages * VM_ACCT(PAGE_CACHE_SIZE));
}

static const struct super_operations shmem_ops;
static const struct address_space_operations shmem_aops;
static const struct file_operations shmem_file_operations;
static const struct inode_operations shmem_inode_operations;
static const struct inode_operations shmem_dir_inode_operations;
static const struct inode_operations shmem_special_inode_operations;
static const struct vm_operations_struct shmem_vm_ops;

static struct backing_dev_info shmem_backing_dev_info __read_mostly = {
	.ra_pages	= 0,	/* No readahead */
	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
};

static LIST_HEAD(shmem_swaplist);
static DEFINE_MUTEX(shmem_swaplist_mutex);

static int shmem_reserve_inode(struct super_block *sb)
{
	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
	if (sbinfo->max_inodes) {
		spin_lock(&sbinfo->stat_lock);
		if (!sbinfo->free_inodes) {
			spin_unlock(&sbinfo->stat_lock);
			return -ENOSPC;
		}
		sbinfo->free_inodes--;
		spin_unlock(&sbinfo->stat_lock);
	}
	return 0;
}

static void shmem_free_inode(struct super_block *sb)
{
	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
	if (sbinfo->max_inodes) {
		spin_lock(&sbinfo->stat_lock);
		sbinfo->free_inodes++;
		spin_unlock(&sbinfo->stat_lock);
	}
}

/**
 * shmem_recalc_inode - recalculate the block usage of an inode
 * @inode: inode to recalc
 *
 * We have to calculate the free blocks since the mm can drop
 * undirtied hole pages behind our back.
 *
 * But normally   info->alloced == inode->i_mapping->nrpages + info->swapped
 * So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped)
 *
 * It has to be called with the spinlock held.
 */
static void shmem_recalc_inode(struct inode *inode)
{
	struct shmem_inode_info *info = SHMEM_I(inode);
	long freed;

	freed = info->alloced - info->swapped - inode->i_mapping->nrpages;
	if (freed > 0) {
		struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
		if (sbinfo->max_blocks)
			percpu_counter_add(&sbinfo->used_blocks, -freed);
		info->alloced -= freed;
		inode->i_blocks -= freed * BLOCKS_PER_PAGE;
		shmem_unacct_blocks(info->flags, freed);
	}
}

/*
 * Replace item expected in radix tree by a new item, while holding tree lock.
 */
static int shmem_radix_tree_replace(struct address_space *mapping,
			pgoff_t index, void *expected, void *replacement)
{
	void **pslot;
	void *item;

	VM_BUG_ON(!expected);
	VM_BUG_ON(!replacement);
	pslot = radix_tree_lookup_slot(&mapping->page_tree, index);
	if (!pslot)
		return -ENOENT;
	item = radix_tree_deref_slot_protected(pslot, &mapping->tree_lock);
	if (item != expected)
		return -ENOENT;
	radix_tree_replace_slot(pslot, replacement);
	return 0;
}
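/*
 * A note on the "exceptional" entries used throughout this file: when a
 * page is swapped out, its radix_tree slot is not cleared but replaced by
 * the swap entry, encoded as a pointer via swp_to_radix_entry().  Roughly
 * (see swapops.h): the swp_entry_t value is shifted up and its low bits
 * tagged, so that radix_tree_exceptional_entry() can distinguish it from
 * a page pointer, and radix_to_swp_entry() undoes the encoding.
 */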
/*
 * Sometimes, before we decide whether to proceed or to fail, we must check
 * that an entry was not already brought back from swap by a racing thread.
 *
 * Checking page is not enough: by the time a SwapCache page is locked, it
 * might be reused, and again be SwapCache, using the same swap as before.
 */
static bool shmem_confirm_swap(struct address_space *mapping,
			       pgoff_t index, swp_entry_t swap)
{
	void *item;

	rcu_read_lock();
	item = radix_tree_lookup(&mapping->page_tree, index);
	rcu_read_unlock();
	return item == swp_to_radix_entry(swap);
}

/*
 * Like add_to_page_cache_locked, but error if expected item has gone.
 */
static int shmem_add_to_page_cache(struct page *page,
				   struct address_space *mapping,
				   pgoff_t index, void *expected)
{
	int error;

	VM_BUG_ON_PAGE(!PageLocked(page), page);
	VM_BUG_ON_PAGE(!PageSwapBacked(page), page);

	page_cache_get(page);
	page->mapping = mapping;
	page->index = index;

	spin_lock_irq(&mapping->tree_lock);
	if (!expected)
		error = radix_tree_insert(&mapping->page_tree, index, page);
	else
		error = shmem_radix_tree_replace(mapping, index, expected,
								 page);
	if (!error) {
		mapping->nrpages++;
		__inc_zone_page_state(page, NR_FILE_PAGES);
		__inc_zone_page_state(page, NR_SHMEM);
		spin_unlock_irq(&mapping->tree_lock);
	} else {
		page->mapping = NULL;
		spin_unlock_irq(&mapping->tree_lock);
		page_cache_release(page);
	}
	return error;
}

/*
 * Like delete_from_page_cache, but substitutes swap for page.
 */
static void shmem_delete_from_page_cache(struct page *page, void *radswap)
{
	struct address_space *mapping = page->mapping;
	int error;

	spin_lock_irq(&mapping->tree_lock);
	error = shmem_radix_tree_replace(mapping, page->index, page, radswap);
	page->mapping = NULL;
	mapping->nrpages--;
	__dec_zone_page_state(page, NR_FILE_PAGES);
	__dec_zone_page_state(page, NR_SHMEM);
	spin_unlock_irq(&mapping->tree_lock);
	page_cache_release(page);
	BUG_ON(error);
}

/*
 * Remove swap entry from radix tree, free the swap and its page cache.
 */
static int shmem_free_swap(struct address_space *mapping,
			   pgoff_t index, void *radswap)
{
	void *old;

	spin_lock_irq(&mapping->tree_lock);
	old = radix_tree_delete_item(&mapping->page_tree, index, radswap);
	spin_unlock_irq(&mapping->tree_lock);
	if (old != radswap)
		return -ENOENT;
	free_swap_and_cache(radix_to_swp_entry(radswap));
	return 0;
}

/*
 * SysV IPC SHM_UNLOCK restores Unevictable pages to their evictable lists.
 */
void shmem_unlock_mapping(struct address_space *mapping)
{
	struct pagevec pvec;
	pgoff_t indices[PAGEVEC_SIZE];
	pgoff_t index = 0;

	pagevec_init(&pvec, 0);
	/*
	 * Minor point, but we might as well stop if someone else SHM_LOCKs it.
	 */
	while (!mapping_unevictable(mapping)) {
		/*
		 * Avoid pagevec_lookup(): find_get_pages() returns 0 as if it
		 * has finished, if it hits a row of PAGEVEC_SIZE swap entries.
		 */
		pvec.nr = find_get_entries(mapping, index,
					   PAGEVEC_SIZE, pvec.pages, indices);
		if (!pvec.nr)
			break;
		index = indices[pvec.nr - 1] + 1;
		pagevec_remove_exceptionals(&pvec);
		check_move_unevictable_pages(pvec.pages, pvec.nr);
		pagevec_release(&pvec);
		cond_resched();
	}
}
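/*
 * A worked example for the start/end/partial arithmetic in
 * shmem_undo_range() below, assuming 4096-byte pages: punching bytes
 * 1000..8999 (lstart 1000, lend 8999) gives start = 1, end = 2,
 * partial_start = 1000 and partial_end = 808.  So page 1 is removed
 * outright, page 0 is zeroed from byte 1000 to the end, and page 2 is
 * zeroed from byte 0 to 807; neither partial page is freed.
 */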
/*
 * Remove range of pages and swap entries from radix tree, and free them.
 * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
 */
static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
								 bool unfalloc)
{
	struct address_space *mapping = inode->i_mapping;
	struct shmem_inode_info *info = SHMEM_I(inode);
	pgoff_t start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
	pgoff_t end = (lend + 1) >> PAGE_CACHE_SHIFT;
	unsigned int partial_start = lstart & (PAGE_CACHE_SIZE - 1);
	unsigned int partial_end = (lend + 1) & (PAGE_CACHE_SIZE - 1);
	struct pagevec pvec;
	pgoff_t indices[PAGEVEC_SIZE];
	long nr_swaps_freed = 0;
	pgoff_t index;
	int i;

	if (lend == -1)
		end = -1;	/* unsigned, so actually very big */

	pagevec_init(&pvec, 0);
	index = start;
	while (index < end) {
		pvec.nr = find_get_entries(mapping, index,
			min(end - index, (pgoff_t)PAGEVEC_SIZE),
			pvec.pages, indices);
		if (!pvec.nr)
			break;
		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];

			index = indices[i];
			if (index >= end)
				break;

			if (radix_tree_exceptional_entry(page)) {
				if (unfalloc)
					continue;
				nr_swaps_freed += !shmem_free_swap(mapping,
								index, page);
				continue;
			}

			if (!trylock_page(page))
				continue;
			if (!unfalloc || !PageUptodate(page)) {
				if (page->mapping == mapping) {
					VM_BUG_ON_PAGE(PageWriteback(page), page);
					truncate_inode_page(mapping, page);
				}
			}
			unlock_page(page);
		}
		pagevec_remove_exceptionals(&pvec);
		pagevec_release(&pvec);
		cond_resched();
		index++;
	}

	if (partial_start) {
		struct page *page = NULL;
		shmem_getpage(inode, start - 1, &page, SGP_READ, NULL);
		if (page) {
			unsigned int top = PAGE_CACHE_SIZE;
			if (start > end) {
				top = partial_end;
				partial_end = 0;
			}
			zero_user_segment(page, partial_start, top);
			set_page_dirty(page);
			unlock_page(page);
			page_cache_release(page);
		}
	}
	if (partial_end) {
		struct page *page = NULL;
		shmem_getpage(inode, end, &page, SGP_READ, NULL);
		if (page) {
			zero_user_segment(page, 0, partial_end);
			set_page_dirty(page);
			unlock_page(page);
			page_cache_release(page);
		}
	}
	if (start >= end)
		return;

	index = start;
	while (index < end) {
		cond_resched();

		pvec.nr = find_get_entries(mapping, index,
				min(end - index, (pgoff_t)PAGEVEC_SIZE),
				pvec.pages, indices);
		if (!pvec.nr) {
			/* If all gone or hole-punch or unfalloc, we're done */
			if (index == start || end != -1)
				break;
			/* But if truncating, restart to make sure all gone */
			index = start;
			continue;
		}
		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];

			index = indices[i];
			if (index >= end)
				break;

			if (radix_tree_exceptional_entry(page)) {
				if (unfalloc)
					continue;
				if (shmem_free_swap(mapping, index, page)) {
					/* Swap was replaced by page: retry */
					index--;
					break;
				}
				nr_swaps_freed++;
				continue;
			}

			lock_page(page);
			if (!unfalloc || !PageUptodate(page)) {
				if (page->mapping == mapping) {
					VM_BUG_ON_PAGE(PageWriteback(page), page);
					truncate_inode_page(mapping, page);
				} else {
					/* Page was replaced by swap: retry */
					unlock_page(page);
					index--;
					break;
				}
			}
			unlock_page(page);
		}
		pagevec_remove_exceptionals(&pvec);
		pagevec_release(&pvec);
		index++;
	}

	spin_lock(&info->lock);
	info->swapped -= nr_swaps_freed;
	shmem_recalc_inode(inode);
	spin_unlock(&info->lock);
}
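/*
 * Note the lend == -1 convention used by the callers below: truncating
 * from newsize to end of file is expressed as
 * shmem_truncate_range(inode, newsize, (loff_t)-1), which makes end wrap
 * to the maximum pgoff_t, so the loops above sweep every remaining page.
 */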
void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
{
	shmem_undo_range(inode, lstart, lend, false);
	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
}
EXPORT_SYMBOL_GPL(shmem_truncate_range);

static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = dentry->d_inode;
	struct shmem_inode_info *info = SHMEM_I(inode);
	int error;

	error = inode_change_ok(inode, attr);
	if (error)
		return error;

	if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
		loff_t oldsize = inode->i_size;
		loff_t newsize = attr->ia_size;

		/* protected by i_mutex */
		if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) ||
		    (newsize > oldsize && (info->seals & F_SEAL_GROW)))
			return -EPERM;

		if (newsize != oldsize) {
			error = shmem_reacct_size(SHMEM_I(inode)->flags,
					oldsize, newsize);
			if (error)
				return error;
			i_size_write(inode, newsize);
			inode->i_ctime = inode->i_mtime = CURRENT_TIME;
		}
		if (newsize < oldsize) {
			loff_t holebegin = round_up(newsize, PAGE_SIZE);
			unmap_mapping_range(inode->i_mapping, holebegin, 0, 1);
			shmem_truncate_range(inode, newsize, (loff_t)-1);
			/* unmap again to remove racily COWed private pages */
			unmap_mapping_range(inode->i_mapping, holebegin, 0, 1);
		}
	}

	setattr_copy(inode, attr);
	if (attr->ia_valid & ATTR_MODE)
		error = posix_acl_chmod(inode, inode->i_mode);
	return error;
}

static void shmem_evict_inode(struct inode *inode)
{
	struct shmem_inode_info *info = SHMEM_I(inode);

	if (inode->i_mapping->a_ops == &shmem_aops) {
		shmem_unacct_size(info->flags, inode->i_size);
		inode->i_size = 0;
		shmem_truncate_range(inode, 0, (loff_t)-1);
		if (!list_empty(&info->swaplist)) {
			mutex_lock(&shmem_swaplist_mutex);
			list_del_init(&info->swaplist);
			mutex_unlock(&shmem_swaplist_mutex);
		}
	} else
		kfree(info->symlink);

	simple_xattrs_free(&info->xattrs);
	WARN_ON(inode->i_blocks);
	shmem_free_inode(inode->i_sb);
	clear_inode(inode);
}

/*
 * If swap found in inode, free it and move page from swapcache to filecache.
 */
static int shmem_unuse_inode(struct shmem_inode_info *info,
			     swp_entry_t swap, struct page **pagep)
{
	struct address_space *mapping = info->vfs_inode.i_mapping;
	void *radswap;
	pgoff_t index;
	gfp_t gfp;
	int error = 0;

	radswap = swp_to_radix_entry(swap);
	index = radix_tree_locate_item(&mapping->page_tree, radswap);
	if (index == -1)
		return -EAGAIN;	/* tell shmem_unuse we found nothing */

	/*
	 * Move _head_ to start search for next from here.
	 * But be careful: shmem_evict_inode checks list_empty without taking
	 * mutex, and there's an instant in list_move_tail when info->swaplist
	 * would appear empty, if it were the only one on shmem_swaplist.
	 */
	if (shmem_swaplist.next != &info->swaplist)
		list_move_tail(&shmem_swaplist, &info->swaplist);

	gfp = mapping_gfp_mask(mapping);
	if (shmem_should_replace_page(*pagep, gfp)) {
		mutex_unlock(&shmem_swaplist_mutex);
		error = shmem_replace_page(pagep, gfp, info, index);
		mutex_lock(&shmem_swaplist_mutex);
		/*
		 * We needed to drop mutex to make that restrictive page
		 * allocation, but the inode might have been freed while we
		 * dropped it: although a racing shmem_evict_inode() cannot
		 * complete without emptying the radix_tree, our page lock
		 * on this swapcache page is not enough to prevent that -
		 * free_swap_and_cache() of our swap entry will only
		 * trylock_page(), removing swap from radix_tree whatever.
		 *
		 * We must not proceed to shmem_add_to_page_cache() if the
		 * inode has been freed, but of course we cannot rely on
		 * inode or mapping or info to check that.  However, we can
		 * safely check if our swap entry is still in use (and here
		 * it can't have got reused for another page): if it's still
		 * in use, then the inode cannot have been freed yet, and we
		 * can safely proceed (if it's no longer in use, that tells
		 * nothing about the inode, but we don't need to unuse swap).
		 */
		if (!page_swapcount(*pagep))
			error = -ENOENT;
	}

	/*
	 * We rely on shmem_swaplist_mutex, not only to protect the swaplist,
	 * but also to hold up shmem_evict_inode(): so inode cannot be freed
	 * beneath us (pagelock doesn't help until the page is in pagecache).
	 */
	if (!error)
		error = shmem_add_to_page_cache(*pagep, mapping, index,
						radswap);
	if (error != -ENOMEM) {
		/*
		 * Truncation and eviction use free_swap_and_cache(), which
		 * only does trylock page: if we raced, best clean up here.
		 */
		delete_from_swap_cache(*pagep);
		set_page_dirty(*pagep);
		if (!error) {
			spin_lock(&info->lock);
			info->swapped--;
			spin_unlock(&info->lock);
			swap_free(swap);
		}
	}
	return error;
}
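/*
 * For orientation: shmem_unuse() below is the swapoff entry point.
 * try_to_unuse() in mm/swapfile.c hands us each swap page it has read
 * back in, and we walk shmem_swaplist asking each swapped-out inode in
 * turn whether the entry belongs to it.
 */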
/*
 * Search through swapped inodes to find and replace swap by page.
 */
int shmem_unuse(swp_entry_t swap, struct page *page)
{
	struct list_head *this, *next;
	struct shmem_inode_info *info;
	struct mem_cgroup *memcg;
	int error = 0;

	/*
	 * There's a faint possibility that swap page was replaced before
	 * caller locked it: caller will come back later with the right page.
	 */
	if (unlikely(!PageSwapCache(page) || page_private(page) != swap.val))
		goto out;

	/*
	 * Charge page using GFP_KERNEL while we can wait, before taking
	 * the shmem_swaplist_mutex which might hold up shmem_writepage().
	 * Charged back to the user (not to caller) when swap account is used.
	 */
	error = mem_cgroup_try_charge(page, current->mm, GFP_KERNEL, &memcg);
	if (error)
		goto out;
	/* No radix_tree_preload: swap entry keeps a place for page in tree */
	error = -EAGAIN;

	mutex_lock(&shmem_swaplist_mutex);
	list_for_each_safe(this, next, &shmem_swaplist) {
		info = list_entry(this, struct shmem_inode_info, swaplist);
		if (info->swapped)
			error = shmem_unuse_inode(info, swap, &page);
		else
			list_del_init(&info->swaplist);
		cond_resched();
		if (error != -EAGAIN)
			break;
		/* found nothing in this: move on to search the next */
	}
	mutex_unlock(&shmem_swaplist_mutex);

	if (error) {
		if (error != -ENOMEM)
			error = 0;
		mem_cgroup_cancel_charge(page, memcg);
	} else
		mem_cgroup_commit_charge(page, memcg, true);
out:
	unlock_page(page);
	page_cache_release(page);
	return error;
}

/*
 * Move the page from the page cache to the swap cache.
 */
static int shmem_writepage(struct page *page, struct writeback_control *wbc)
{
	struct shmem_inode_info *info;
	struct address_space *mapping;
	struct inode *inode;
	swp_entry_t swap;
	pgoff_t index;

	BUG_ON(!PageLocked(page));
	mapping = page->mapping;
	index = page->index;
	inode = mapping->host;
	info = SHMEM_I(inode);
	if (info->flags & VM_LOCKED)
		goto redirty;
	if (!total_swap_pages)
		goto redirty;

	/*
	 * shmem_backing_dev_info's capabilities prevent regular writeback or
	 * sync from ever calling shmem_writepage; but a stacking filesystem
	 * might use ->writepage of its underlying filesystem, in which case
	 * tmpfs should write out to swap only in response to memory pressure,
	 * and not for the writeback threads or sync.
	 */
	if (!wbc->for_reclaim) {
		WARN_ON_ONCE(1);	/* Still happens? Tell us about it! */
		goto redirty;
	}

	/*
	 * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC
	 * value into swapfile.c, the only way we can correctly account for a
	 * fallocated page arriving here is now to initialize it and write it.
	 *
	 * That's okay for a page already fallocated earlier, but if we have
	 * not yet completed the fallocation, then (a) we want to keep track
	 * of this page in case we have to undo it, and (b) it may not be a
	 * good idea to continue anyway, once we're pushing into swap.  So
	 * reactivate the page, and let shmem_fallocate() quit when too many.
	 */
	if (!PageUptodate(page)) {
		if (inode->i_private) {
			struct shmem_falloc *shmem_falloc;
			spin_lock(&inode->i_lock);
			shmem_falloc = inode->i_private;
			if (shmem_falloc &&
			    !shmem_falloc->waitq &&
			    index >= shmem_falloc->start &&
			    index < shmem_falloc->next)
				shmem_falloc->nr_unswapped++;
			else
				shmem_falloc = NULL;
			spin_unlock(&inode->i_lock);
			if (shmem_falloc)
				goto redirty;
		}
		clear_highpage(page);
		flush_dcache_page(page);
		SetPageUptodate(page);
	}

	swap = get_swap_page();
	if (!swap.val)
		goto redirty;

	/*
	 * Add inode to shmem_unuse()'s list of swapped-out inodes,
	 * if it's not already there.  Do it now before the page is
	 * moved to swap cache, when its pagelock no longer protects
	 * the inode from eviction.  But don't unlock the mutex until
	 * we've incremented swapped, because shmem_unuse_inode() will
	 * prune a !swapped inode from the swaplist under this mutex.
	 */
	mutex_lock(&shmem_swaplist_mutex);
	if (list_empty(&info->swaplist))
		list_add_tail(&info->swaplist, &shmem_swaplist);

	if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
		swap_shmem_alloc(swap);
		shmem_delete_from_page_cache(page, swp_to_radix_entry(swap));

		spin_lock(&info->lock);
		info->swapped++;
		shmem_recalc_inode(inode);
		spin_unlock(&info->lock);

		mutex_unlock(&shmem_swaplist_mutex);
		BUG_ON(page_mapped(page));
		swap_writepage(page, wbc);
		return 0;
	}

	mutex_unlock(&shmem_swaplist_mutex);
	swapcache_free(swap);
redirty:
	set_page_dirty(page);
	if (wbc->for_reclaim)
		return AOP_WRITEPAGE_ACTIVATE;	/* Return with page locked */
	unlock_page(page);
	return 0;
}

#ifdef CONFIG_NUMA
#ifdef CONFIG_TMPFS
static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
{
	char buffer[64];

	if (!mpol || mpol->mode == MPOL_DEFAULT)
		return;		/* show nothing */

	mpol_to_str(buffer, sizeof(buffer), mpol);

	seq_printf(seq, ",mpol=%s", buffer);
}

static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
{
	struct mempolicy *mpol = NULL;
	if (sbinfo->mpol) {
		spin_lock(&sbinfo->stat_lock);	/* prevent replace/use races */
		mpol = sbinfo->mpol;
		mpol_get(mpol);
		spin_unlock(&sbinfo->stat_lock);
	}
	return mpol;
}
#endif /* CONFIG_TMPFS */

static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
			struct shmem_inode_info *info, pgoff_t index)
{
	struct vm_area_struct pvma;
	struct page *page;

	/* Create a pseudo vma that just contains the policy */
	pvma.vm_start = 0;
	/* Bias interleave by inode number to distribute better across nodes */
	pvma.vm_pgoff = index + info->vfs_inode.i_ino;
	pvma.vm_ops = NULL;
	pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index);

	page = swapin_readahead(swap, gfp, &pvma, 0);

	/* Drop reference taken by mpol_shared_policy_lookup() */
	mpol_cond_put(pvma.vm_policy);

	return page;
}

static struct page *shmem_alloc_page(gfp_t gfp,
			struct shmem_inode_info *info, pgoff_t index)
{
	struct vm_area_struct pvma;
	struct page *page;

	/* Create a pseudo vma that just contains the policy */
	pvma.vm_start = 0;
	/* Bias interleave by inode number to distribute better across nodes */
	pvma.vm_pgoff = index + info->vfs_inode.i_ino;
	pvma.vm_ops = NULL;
	pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index);

	page = alloc_page_vma(gfp, &pvma, 0);

	/* Drop reference taken by mpol_shared_policy_lookup() */
	mpol_cond_put(pvma.vm_policy);

	return page;
}
#else /* !CONFIG_NUMA */
#ifdef CONFIG_TMPFS
static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
{
}
#endif /* CONFIG_TMPFS */

static inline struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
			struct shmem_inode_info *info, pgoff_t index)
{
	return swapin_readahead(swap, gfp, NULL, 0);
}

static inline struct page *shmem_alloc_page(gfp_t gfp,
			struct shmem_inode_info *info, pgoff_t index)
{
	return alloc_page(gfp);
}
#endif /* CONFIG_NUMA */

#if !defined(CONFIG_NUMA) || !defined(CONFIG_TMPFS)
static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
{
	return NULL;
}
#endif
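/*
 * On the pseudo-vma trick above: page allocation and swapin readahead are
 * vma-oriented APIs, but a tmpfs page may be allocated with no real vma
 * in sight.  So shmem_alloc_page() and shmem_swapin() build a throwaway
 * on-stack vm_area_struct carrying just enough state (vm_start, vm_pgoff,
 * vm_policy) to convey the inode's shared mempolicy to those APIs.
 */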
/*
 * When a page is moved from swapcache to shmem filecache (either by the
 * usual swapin of shmem_getpage_gfp(), or by the less common swapoff of
 * shmem_unuse_inode()), it may have been read in earlier from swap, in
 * ignorance of the mapping it belongs to.  If that mapping has special
 * constraints (like the gma500 GEM driver, which requires RAM below 4GB),
 * we may need to copy to a suitable page before moving to filecache.
 *
 * In a future release, this may well be extended to respect cpuset and
 * NUMA mempolicy, and applied also to anonymous pages in do_swap_page();
 * but for now it is a simple matter of zone.
 */
static bool shmem_should_replace_page(struct page *page, gfp_t gfp)
{
	return page_zonenum(page) > gfp_zone(gfp);
}

static int shmem_replace_page(struct page **pagep, gfp_t gfp,
				struct shmem_inode_info *info, pgoff_t index)
{
	struct page *oldpage, *newpage;
	struct address_space *swap_mapping;
	pgoff_t swap_index;
	int error;

	oldpage = *pagep;
	swap_index = page_private(oldpage);
	swap_mapping = page_mapping(oldpage);

	/*
	 * We have arrived here because our zones are constrained, so don't
	 * limit chance of success by further cpuset and node constraints.
	 */
	gfp &= ~GFP_CONSTRAINT_MASK;
	newpage = shmem_alloc_page(gfp, info, index);
	if (!newpage)
		return -ENOMEM;

	page_cache_get(newpage);
	copy_highpage(newpage, oldpage);
	flush_dcache_page(newpage);

	__set_page_locked(newpage);
	SetPageUptodate(newpage);
	SetPageSwapBacked(newpage);
	set_page_private(newpage, swap_index);
	SetPageSwapCache(newpage);

	/*
	 * Our caller will very soon move newpage out of swapcache, but it's
	 * a nice clean interface for us to replace oldpage by newpage there.
	 */
	spin_lock_irq(&swap_mapping->tree_lock);
	error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage,
								   newpage);
	if (!error) {
		__inc_zone_page_state(newpage, NR_FILE_PAGES);
		__dec_zone_page_state(oldpage, NR_FILE_PAGES);
	}
	spin_unlock_irq(&swap_mapping->tree_lock);

	if (unlikely(error)) {
		/*
		 * Is this possible?  I think not, now that our callers check
		 * both PageSwapCache and page_private after getting page lock;
		 * but be defensive.  Reverse old to newpage for clear and free.
		 */
		oldpage = newpage;
	} else {
		mem_cgroup_migrate(oldpage, newpage, false);
		lru_cache_add_anon(newpage);
		*pagep = newpage;
	}

	ClearPageSwapCache(oldpage);
	set_page_private(oldpage, 0);

	unlock_page(oldpage);
	page_cache_release(oldpage);
	page_cache_release(oldpage);
	return error;
}
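/*
 * Example of the zone test in shmem_should_replace_page() above: if a
 * mapping's gfp mask allows only ZONE_DMA32 (as for gma500's need for RAM
 * below 4GB) but the swapped-in page happens to sit in ZONE_NORMAL, then
 * page_zonenum(page) exceeds gfp_zone(gfp), and shmem_replace_page()
 * copies the data into a page allocated within the permitted zones.
 */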
/*
 * shmem_getpage_gfp - find page in cache, or get from swap, or allocate
 *
 * If we allocate a new one we do not mark it dirty. That's up to the
 * vm. If we swap it in we mark it dirty, since we also free the swap
 * entry: a page cannot live in both the swap and page cache.
 */
static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
	struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type)
{
	struct address_space *mapping = inode->i_mapping;
	struct shmem_inode_info *info;
	struct shmem_sb_info *sbinfo;
	struct mem_cgroup *memcg;
	struct page *page;
	swp_entry_t swap;
	int error;
	int once = 0;
	int alloced = 0;

	if (index > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT))
		return -EFBIG;
repeat:
	swap.val = 0;
	page = find_lock_entry(mapping, index);
	if (radix_tree_exceptional_entry(page)) {
		swap = radix_to_swp_entry(page);
		page = NULL;
	}

	if (sgp != SGP_WRITE && sgp != SGP_FALLOC &&
	    ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
		error = -EINVAL;
		goto failed;
	}

	if (page && sgp == SGP_WRITE)
		mark_page_accessed(page);

	/* fallocated page? */
	if (page && !PageUptodate(page)) {
		if (sgp != SGP_READ)
			goto clear;
		unlock_page(page);
		page_cache_release(page);
		page = NULL;
	}
	if (page || (sgp == SGP_READ && !swap.val)) {
		*pagep = page;
		return 0;
	}

	/*
	 * Fast cache lookup did not find it:
	 * bring it back from swap or allocate.
	 */
	info = SHMEM_I(inode);
	sbinfo = SHMEM_SB(inode->i_sb);

	if (swap.val) {
		/* Look it up and read it in.. */
		page = lookup_swap_cache(swap);
		if (!page) {
			/* here we actually do the io */
			if (fault_type)
				*fault_type |= VM_FAULT_MAJOR;
			page = shmem_swapin(swap, gfp, info, index);
			if (!page) {
				error = -ENOMEM;
				goto failed;
			}
		}

		/* We have to do this with page locked to prevent races */
		lock_page(page);
		if (!PageSwapCache(page) || page_private(page) != swap.val ||
		    !shmem_confirm_swap(mapping, index, swap)) {
			error = -EEXIST;	/* try again */
			goto unlock;
		}
		if (!PageUptodate(page)) {
			error = -EIO;
			goto failed;
		}
		wait_on_page_writeback(page);

		if (shmem_should_replace_page(page, gfp)) {
			error = shmem_replace_page(&page, gfp, info, index);
			if (error)
				goto failed;
		}

		error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg);
		if (!error) {
			error = shmem_add_to_page_cache(page, mapping, index,
						swp_to_radix_entry(swap));
			/*
			 * We already confirmed swap under page lock, and make
			 * no memory allocation here, so usually no possibility
			 * of error; but free_swap_and_cache() only trylocks a
			 * page, so it is just possible that the entry has been
			 * truncated or holepunched since swap was confirmed.
			 * shmem_undo_range() will have done some of the
			 * unaccounting, now delete_from_swap_cache() will do
			 * the rest (including mem_cgroup_uncharge_swapcache).
			 * Reset swap.val? No, leave it so "failed" goes back to
			 * "repeat": reading a hole and writing should succeed.
			 */
			if (error) {
				mem_cgroup_cancel_charge(page, memcg);
				delete_from_swap_cache(page);
			}
		}
		if (error)
			goto failed;

		mem_cgroup_commit_charge(page, memcg, true);

		spin_lock(&info->lock);
		info->swapped--;
		shmem_recalc_inode(inode);
		spin_unlock(&info->lock);

		if (sgp == SGP_WRITE)
			mark_page_accessed(page);

		delete_from_swap_cache(page);
		set_page_dirty(page);
		swap_free(swap);

	} else {
		if (shmem_acct_block(info->flags)) {
			error = -ENOSPC;
			goto failed;
		}
		if (sbinfo->max_blocks) {
			if (percpu_counter_compare(&sbinfo->used_blocks,
						sbinfo->max_blocks) >= 0) {
				error = -ENOSPC;
				goto unacct;
			}
			percpu_counter_inc(&sbinfo->used_blocks);
		}

		page = shmem_alloc_page(gfp, info, index);
		if (!page) {
			error = -ENOMEM;
			goto decused;
		}

		__SetPageSwapBacked(page);
		__set_page_locked(page);
		if (sgp == SGP_WRITE)
			__SetPageReferenced(page);

		error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg);
		if (error)
			goto decused;
		error = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK);
		if (!error) {
			error = shmem_add_to_page_cache(page, mapping, index,
							NULL);
			radix_tree_preload_end();
		}
		if (error) {
			mem_cgroup_cancel_charge(page, memcg);
			goto decused;
		}
		mem_cgroup_commit_charge(page, memcg, false);
		lru_cache_add_anon(page);

		spin_lock(&info->lock);
		info->alloced++;
		inode->i_blocks += BLOCKS_PER_PAGE;
		shmem_recalc_inode(inode);
		spin_unlock(&info->lock);
		alloced = true;

		/*
		 * Let SGP_FALLOC use the SGP_WRITE optimization on a new page.
		 */
		if (sgp == SGP_FALLOC)
			sgp = SGP_WRITE;
clear:
		/*
		 * Let SGP_WRITE caller clear ends if write does not fill page;
		 * but SGP_FALLOC on a page fallocated earlier must initialize
		 * it now, lest undo on failure cancel our earlier guarantee.
		 */
		if (sgp != SGP_WRITE) {
			clear_highpage(page);
			flush_dcache_page(page);
			SetPageUptodate(page);
		}
		if (sgp == SGP_DIRTY)
			set_page_dirty(page);
	}

	/* Perhaps the file has been truncated since we checked */
	if (sgp != SGP_WRITE && sgp != SGP_FALLOC &&
	    ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
		error = -EINVAL;
		if (alloced)
			goto trunc;
		else
			goto failed;
	}
	*pagep = page;
	return 0;

	/*
	 * Error recovery.
	 */
trunc:
	info = SHMEM_I(inode);
	ClearPageDirty(page);
	delete_from_page_cache(page);
	spin_lock(&info->lock);
	info->alloced--;
	inode->i_blocks -= BLOCKS_PER_PAGE;
	spin_unlock(&info->lock);
decused:
	sbinfo = SHMEM_SB(inode->i_sb);
	if (sbinfo->max_blocks)
		percpu_counter_add(&sbinfo->used_blocks, -1);
unacct:
	shmem_unacct_blocks(info->flags, 1);
failed:
	if (swap.val && error != -EINVAL &&
	    !shmem_confirm_swap(mapping, index, swap))
		error = -EEXIST;
unlock:
	if (page) {
		unlock_page(page);
		page_cache_release(page);
	}
	if (error == -ENOSPC && !once++) {
		info = SHMEM_I(inode);
		spin_lock(&info->lock);
		shmem_recalc_inode(inode);
		spin_unlock(&info->lock);
		goto repeat;
	}
	if (error == -EEXIST)	/* from above or from radix_tree_insert */
		goto repeat;
	return error;
}

static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct inode *inode = file_inode(vma->vm_file);
	int error;
	int ret = VM_FAULT_LOCKED;

	/*
	 * Trinity finds that probing a hole which tmpfs is punching can
	 * prevent the hole-punch from ever completing: which in turn
	 * locks writers out with its hold on i_mutex.  So refrain from
	 * faulting pages into the hole while it's being punched.  Although
	 * shmem_undo_range() does remove the additions, it may be unable to
	 * keep up, as each new page needs its own unmap_mapping_range() call,
	 * and the i_mmap tree grows ever slower to scan if new vmas are added.
	 *
	 * It does not matter if we sometimes reach this check just before the
	 * hole-punch begins, so that one fault then races with the punch:
	 * we just need to make racing faults a rare case.
	 *
	 * The implementation below would be much simpler if we just used a
	 * standard mutex or completion: but we cannot take i_mutex in fault,
	 * and bloating every shmem inode for this unlikely case would be sad.
	 */
	if (unlikely(inode->i_private)) {
		struct shmem_falloc *shmem_falloc;

		spin_lock(&inode->i_lock);
		shmem_falloc = inode->i_private;
		if (shmem_falloc &&
		    shmem_falloc->waitq &&
		    vmf->pgoff >= shmem_falloc->start &&
		    vmf->pgoff < shmem_falloc->next) {
			wait_queue_head_t *shmem_falloc_waitq;
			DEFINE_WAIT(shmem_fault_wait);

			ret = VM_FAULT_NOPAGE;
			if ((vmf->flags & FAULT_FLAG_ALLOW_RETRY) &&
			   !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) {
				/* It's polite to up mmap_sem if we can */
				up_read(&vma->vm_mm->mmap_sem);
				ret = VM_FAULT_RETRY;
			}

			shmem_falloc_waitq = shmem_falloc->waitq;
			prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait,
					TASK_UNINTERRUPTIBLE);
			spin_unlock(&inode->i_lock);
			schedule();

			/*
			 * shmem_falloc_waitq points into the shmem_fallocate()
			 * stack of the hole-punching task: shmem_falloc_waitq
			 * is usually invalid by the time we reach here, but
			 * finish_wait() does not dereference it in that case;
			 * though i_lock needed lest racing with wake_up_all().
			 */
			spin_lock(&inode->i_lock);
			finish_wait(shmem_falloc_waitq, &shmem_fault_wait);
			spin_unlock(&inode->i_lock);
			return ret;
		}
		spin_unlock(&inode->i_lock);
	}

	error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret);
	if (error)
		return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);

	if (ret & VM_FAULT_MAJOR) {
		count_vm_event(PGMAJFAULT);
		mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
	}
	return ret;
}

#ifdef CONFIG_NUMA
static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
{
	struct inode *inode = file_inode(vma->vm_file);
	return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol);
}

static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
					  unsigned long addr)
{
	struct inode *inode = file_inode(vma->vm_file);
	pgoff_t index;

	index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
	return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index);
}
#endif

int shmem_lock(struct file *file, int lock, struct user_struct *user)
{
	struct inode *inode = file_inode(file);
	struct shmem_inode_info *info = SHMEM_I(inode);
	int retval = -ENOMEM;

	spin_lock(&info->lock);
	if (lock && !(info->flags & VM_LOCKED)) {
		if (!user_shm_lock(inode->i_size, user))
			goto out_nomem;
		info->flags |= VM_LOCKED;
		mapping_set_unevictable(file->f_mapping);
	}
	if (!lock && (info->flags & VM_LOCKED) && user) {
		user_shm_unlock(inode->i_size, user);
		info->flags &= ~VM_LOCKED;
		mapping_clear_unevictable(file->f_mapping);
	}
	retval = 0;

out_nomem:
	spin_unlock(&info->lock);
	return retval;
}

static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
{
	file_accessed(file);
	vma->vm_ops = &shmem_vm_ops;
	return 0;
}

static struct inode *shmem_get_inode(struct super_block *sb, const struct inode *dir,
				     umode_t mode, dev_t dev, unsigned long flags)
{
	struct inode *inode;
	struct shmem_inode_info *info;
	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);

	if (shmem_reserve_inode(sb))
		return NULL;

	inode = new_inode(sb);
	if (inode) {
		inode->i_ino = get_next_ino();
		inode_init_owner(inode, dir, mode);
		inode->i_blocks = 0;
		inode->i_mapping->backing_dev_info = &shmem_backing_dev_info;
		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
		inode->i_generation = get_seconds();
		info = SHMEM_I(inode);
		memset(info, 0, (char *)inode - (char *)info);
		spin_lock_init(&info->lock);
		info->seals = F_SEAL_SEAL;
		info->flags = flags & VM_NORESERVE;
		INIT_LIST_HEAD(&info->swaplist);
		simple_xattrs_init(&info->xattrs);
		cache_no_acl(inode);

		switch (mode & S_IFMT) {
		default:
			inode->i_op = &shmem_special_inode_operations;
			init_special_inode(inode, mode, dev);
			break;
		case S_IFREG:
			inode->i_mapping->a_ops = &shmem_aops;
			inode->i_op = &shmem_inode_operations;
			inode->i_fop = &shmem_file_operations;
			mpol_shared_policy_init(&info->policy,
						 shmem_get_sbmpol(sbinfo));
			break;
		case S_IFDIR:
			inc_nlink(inode);
			/* Some things misbehave if size == 0 on a directory */
			inode->i_size = 2 * BOGO_DIRENT_SIZE;
			inode->i_op = &shmem_dir_inode_operations;
			inode->i_fop = &simple_dir_operations;
			break;
		case S_IFLNK:
			/*
			 * Must not load anything in the rbtree,
			 * mpol_free_shared_policy will not be called.
			 */
			mpol_shared_policy_init(&info->policy, NULL);
			break;
		}
	} else
		shmem_free_inode(sb);
	return inode;
}

bool shmem_mapping(struct address_space *mapping)
{
	return mapping->backing_dev_info == &shmem_backing_dev_info;
}

#ifdef CONFIG_TMPFS
static const struct inode_operations shmem_symlink_inode_operations;
static const struct inode_operations shmem_short_symlink_operations;

#ifdef CONFIG_TMPFS_XATTR
static int shmem_initxattrs(struct inode *, const struct xattr *, void *);
#else
#define shmem_initxattrs NULL
#endif

static int
shmem_write_begin(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned flags,
			struct page **pagep, void **fsdata)
{
	struct inode *inode = mapping->host;
	struct shmem_inode_info *info = SHMEM_I(inode);
	pgoff_t index = pos >> PAGE_CACHE_SHIFT;

	/* i_mutex is held by caller */
	if (unlikely(info->seals)) {
		if (info->seals & F_SEAL_WRITE)
			return -EPERM;
		if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size)
			return -EPERM;
	}

	return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL);
}

static int
shmem_write_end(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned copied,
			struct page *page, void *fsdata)
{
	struct inode *inode = mapping->host;

	if (pos + copied > inode->i_size)
		i_size_write(inode, pos + copied);

	if (!PageUptodate(page)) {
		if (copied < PAGE_CACHE_SIZE) {
			unsigned from = pos & (PAGE_CACHE_SIZE - 1);
			zero_user_segments(page, 0, from,
					from + copied, PAGE_CACHE_SIZE);
		}
		SetPageUptodate(page);
	}
	set_page_dirty(page);
	unlock_page(page);
	page_cache_release(page);

	return copied;
}
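/*
 * For orientation: the two hooks above are driven by the generic write
 * path -- generic_perform_write() calls ->write_begin to obtain a locked
 * page, copies the user data into it, then calls ->write_end, which here
 * zeroes the parts of a new page the copy did not cover before marking
 * it Uptodate and dirty.
 */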
static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(file);
	struct address_space *mapping = inode->i_mapping;
	pgoff_t index;
	unsigned long offset;
	enum sgp_type sgp = SGP_READ;
	int error = 0;
	ssize_t retval = 0;
	loff_t *ppos = &iocb->ki_pos;

	/*
	 * Might this read be for a stacking filesystem?  Then when reading
	 * holes of a sparse file, we actually need to allocate those pages,
	 * and even mark them dirty, so it cannot exceed the max_blocks limit.
	 */
	if (segment_eq(get_fs(), KERNEL_DS))
		sgp = SGP_DIRTY;

	index = *ppos >> PAGE_CACHE_SHIFT;
	offset = *ppos & ~PAGE_CACHE_MASK;

	for (;;) {
		struct page *page = NULL;
		pgoff_t end_index;
		unsigned long nr, ret;
		loff_t i_size = i_size_read(inode);

		end_index = i_size >> PAGE_CACHE_SHIFT;
		if (index > end_index)
			break;
		if (index == end_index) {
			nr = i_size & ~PAGE_CACHE_MASK;
			if (nr <= offset)
				break;
		}

		error = shmem_getpage(inode, index, &page, sgp, NULL);
		if (error) {
			if (error == -EINVAL)
				error = 0;
			break;
		}
		if (page)
			unlock_page(page);

		/*
		 * We must evaluate after, since reads (unlike writes)
		 * are called without i_mutex protection against truncate
		 */
		nr = PAGE_CACHE_SIZE;
		i_size = i_size_read(inode);
		end_index = i_size >> PAGE_CACHE_SHIFT;
		if (index == end_index) {
			nr = i_size & ~PAGE_CACHE_MASK;
			if (nr <= offset) {
				if (page)
					page_cache_release(page);
				break;
			}
		}
		nr -= offset;

		if (page) {
			/*
			 * If users can be writing to this page using arbitrary
			 * virtual addresses, take care about potential aliasing
			 * before reading the page on the kernel side.
			 */
			if (mapping_writably_mapped(mapping))
				flush_dcache_page(page);
			/*
			 * Mark the page accessed if we read the beginning.
			 */
			if (!offset)
				mark_page_accessed(page);
		} else {
			page = ZERO_PAGE(0);
			page_cache_get(page);
		}

		/*
		 * Ok, we have the page, and it's up-to-date, so
		 * now we can copy it to user space...
		 */
		ret = copy_page_to_iter(page, offset, nr, to);
		retval += ret;
		offset += ret;
		index += offset >> PAGE_CACHE_SHIFT;
		offset &= ~PAGE_CACHE_MASK;

		page_cache_release(page);
		if (!iov_iter_count(to))
			break;
		if (ret < nr) {
			error = -EFAULT;
			break;
		}
		cond_resched();
	}

	*ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
	file_accessed(file);
	return retval ? retval : error;
}

static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
				struct pipe_inode_info *pipe, size_t len,
				unsigned int flags)
{
	struct address_space *mapping = in->f_mapping;
	struct inode *inode = mapping->host;
	unsigned int loff, nr_pages, req_pages;
	struct page *pages[PIPE_DEF_BUFFERS];
	struct partial_page partial[PIPE_DEF_BUFFERS];
	struct page *page;
	pgoff_t index, end_index;
	loff_t isize, left;
	int error, page_nr;
	struct splice_pipe_desc spd = {
		.pages = pages,
		.partial = partial,
		.nr_pages_max = PIPE_DEF_BUFFERS,
		.flags = flags,
		.ops = &page_cache_pipe_buf_ops,
		.spd_release = spd_release_page,
	};

	isize = i_size_read(inode);
	if (unlikely(*ppos >= isize))
		return 0;

	left = isize - *ppos;
	if (unlikely(left < len))
		len = left;

	if (splice_grow_spd(pipe, &spd))
		return -ENOMEM;

	index = *ppos >> PAGE_CACHE_SHIFT;
	loff = *ppos & ~PAGE_CACHE_MASK;
	req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
	nr_pages = min(req_pages, spd.nr_pages_max);

	spd.nr_pages = find_get_pages_contig(mapping, index,
						nr_pages, spd.pages);
	index += spd.nr_pages;
	error = 0;

	while (spd.nr_pages < nr_pages) {
		error = shmem_getpage(inode, index, &page, SGP_CACHE, NULL);
		if (error)
			break;
		unlock_page(page);
		spd.pages[spd.nr_pages++] = page;
		index++;
	}

	index = *ppos >> PAGE_CACHE_SHIFT;
	nr_pages = spd.nr_pages;
	spd.nr_pages = 0;

	for (page_nr = 0; page_nr < nr_pages; page_nr++) {
		unsigned int this_len;

		if (!len)
			break;

		this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
		page = spd.pages[page_nr];

		if (!PageUptodate(page) || page->mapping != mapping) {
			error = shmem_getpage(inode, index, &page,
							SGP_CACHE, NULL);
			if (error)
				break;
			unlock_page(page);
			page_cache_release(spd.pages[page_nr]);
			spd.pages[page_nr] = page;
		}

		isize = i_size_read(inode);
		end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
		if (unlikely(!isize || index > end_index))
			break;

		if (end_index == index) {
			unsigned int plen;

			plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
			if (plen <= loff)
				break;

			this_len = min(this_len, plen - loff);
			len = this_len;
		}

		spd.partial[page_nr].offset = loff;
		spd.partial[page_nr].len = this_len;
		len -= this_len;
		loff = 0;
		spd.nr_pages++;
		index++;
	}

	while (page_nr < nr_pages)
		page_cache_release(spd.pages[page_nr++]);

	if (spd.nr_pages)
		error = splice_to_pipe(pipe, &spd);

	splice_shrink_spd(&spd);

	if (error > 0) {
		*ppos += error;
		file_accessed(in);
	}
	return error;
}
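/*
 * Userspace view of the SEEK_DATA/SEEK_HOLE support below: on a sparse
 * tmpfs file, lseek(fd, 0, SEEK_DATA) skips forward to the first page
 * that is actually instantiated (in cache or swap), and SEEK_HOLE finds
 * the first gap; both resolve at page granularity here, since the scan
 * walks the radix_tree page by page.
 */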
/*
 * llseek SEEK_DATA or SEEK_HOLE through the radix_tree.
 */
static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
				    pgoff_t index, pgoff_t end, int whence)
{
	struct page *page;
	struct pagevec pvec;
	pgoff_t indices[PAGEVEC_SIZE];
	bool done = false;
	int i;

	pagevec_init(&pvec, 0);
	pvec.nr = 1;		/* start small: we may be there already */
	while (!done) {
		pvec.nr = find_get_entries(mapping, index,
					pvec.nr, pvec.pages, indices);
		if (!pvec.nr) {
			if (whence == SEEK_DATA)
				index = end;
			break;
		}
		for (i = 0; i < pvec.nr; i++, index++) {
			if (index < indices[i]) {
				if (whence == SEEK_HOLE) {
					done = true;
					break;
				}
				index = indices[i];
			}
			page = pvec.pages[i];
			if (page && !radix_tree_exceptional_entry(page)) {
				if (!PageUptodate(page))
					page = NULL;
			}
			if (index >= end ||
			    (page && whence == SEEK_DATA) ||
			    (!page && whence == SEEK_HOLE)) {
				done = true;
				break;
			}
		}
		pagevec_remove_exceptionals(&pvec);
		pagevec_release(&pvec);
		pvec.nr = PAGEVEC_SIZE;
		cond_resched();
	}
	return index;
}

static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
{
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	pgoff_t start, end;
	loff_t new_offset;

	if (whence != SEEK_DATA && whence != SEEK_HOLE)
		return generic_file_llseek_size(file, offset, whence,
					MAX_LFS_FILESIZE, i_size_read(inode));
	mutex_lock(&inode->i_mutex);
	/* We're holding i_mutex so we can access i_size directly */

	if (offset < 0)
		offset = -EINVAL;
	else if (offset >= inode->i_size)
		offset = -ENXIO;
	else {
		start = offset >> PAGE_CACHE_SHIFT;
		end = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
		new_offset = shmem_seek_hole_data(mapping, start, end, whence);
		new_offset <<= PAGE_CACHE_SHIFT;
		if (new_offset > offset) {
			if (new_offset < inode->i_size)
				offset = new_offset;
			else if (whence == SEEK_DATA)
				offset = -ENXIO;
			else
				offset = inode->i_size;
		}
	}

	if (offset >= 0)
		offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE);
	mutex_unlock(&inode->i_mutex);
	return offset;
}

static int shmem_wait_for_pins(struct address_space *mapping)
{
	return 0;
}

#define F_ALL_SEALS (F_SEAL_SEAL | \
		     F_SEAL_SHRINK | \
		     F_SEAL_GROW | \
		     F_SEAL_WRITE)

int shmem_add_seals(struct file *file, unsigned int seals)
{
	struct inode *inode = file_inode(file);
	struct shmem_inode_info *info = SHMEM_I(inode);
	int error;

	/*
	 * SEALING
	 * Sealing allows multiple parties to share a shmem-file but restrict
	 * access to a specific subset of file operations. Seals can only be
	 * added, but never removed. This way, mutually untrusted parties can
	 * share common memory regions with a well-defined policy. A malicious
	 * peer can thus never perform unwanted operations on a shared object.
	 *
	 * Seals are only supported on special shmem-files and always affect
	 * the whole underlying inode. Once a seal is set, it may prevent some
	 * kinds of access to the file. Currently, the following seals are
	 * defined:
	 *   SEAL_SEAL: Prevent further seals from being set on this file
	 *   SEAL_SHRINK: Prevent the file from shrinking
	 *   SEAL_GROW: Prevent the file from growing
	 *   SEAL_WRITE: Prevent write access to the file
	 *
	 * As we don't require any trust relationship between two parties, we
	 * must prevent seals from being removed. Therefore, sealing a file
	 * only adds a given set of seals to the file, it never touches
	 * existing seals. Furthermore, the "setting seals"-operation can be
	 * sealed itself, which basically prevents any further seal from being
	 * added.
	 *
	 * Semantics of sealing are only defined on volatile files. Only
	 * anonymous shmem files support sealing. More importantly, seals are
	 * never written to disk. Therefore, there's no plan to support it on
	 * other file types.
	 */

	if (file->f_op != &shmem_file_operations)
		return -EINVAL;
	if (!(file->f_mode & FMODE_WRITE))
		return -EPERM;
	if (seals & ~(unsigned int)F_ALL_SEALS)
		return -EINVAL;

	mutex_lock(&inode->i_mutex);

	if (info->seals & F_SEAL_SEAL) {
		error = -EPERM;
		goto unlock;
	}

	if ((seals & F_SEAL_WRITE) && !(info->seals & F_SEAL_WRITE)) {
		error = mapping_deny_writable(file->f_mapping);
		if (error)
			goto unlock;

		error = shmem_wait_for_pins(file->f_mapping);
		if (error) {
			mapping_allow_writable(file->f_mapping);
			goto unlock;
		}
	}

	info->seals |= seals;
	error = 0;

unlock:
	mutex_unlock(&inode->i_mutex);
	return error;
}
EXPORT_SYMBOL_GPL(shmem_add_seals);

int shmem_get_seals(struct file *file)
{
	if (file->f_op != &shmem_file_operations)
		return -EINVAL;

	return SHMEM_I(file_inode(file))->seals;
}
EXPORT_SYMBOL_GPL(shmem_get_seals);

long shmem_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
{
	long error;

	switch (cmd) {
	case F_ADD_SEALS:
		/* disallow upper 32bit */
		if (arg > UINT_MAX)
			return -EINVAL;

		error = shmem_add_seals(file, arg);
		break;
	case F_GET_SEALS:
		error = shmem_get_seals(file);
		break;
	default:
		error = -EINVAL;
		break;
	}

	return error;
}
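/*
 * A userspace sketch of the fcntl interface above, with a hypothetical
 * fd referring to a sealable shmem file (e.g. one from memfd_create()):
 *
 *	fcntl(fd, F_ADD_SEALS, F_SEAL_SHRINK | F_SEAL_SEAL);
 *	seals = fcntl(fd, F_GET_SEALS);		 now reports both seals
 *	fcntl(fd, F_ADD_SEALS, F_SEAL_GROW);	 fails with EPERM
 *
 * The final call fails because F_SEAL_SEAL forbids adding further seals.
 */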
(u64)unmap_start) 1977 unmap_mapping_range(mapping, unmap_start, 1978 1 + unmap_end - unmap_start, 0); 1979 shmem_truncate_range(inode, offset, offset + len - 1); 1980 /* No need to unmap again: hole-punching leaves COWed pages */ 1981 1982 spin_lock(&inode->i_lock); 1983 inode->i_private = NULL; 1984 wake_up_all(&shmem_falloc_waitq); 1985 spin_unlock(&inode->i_lock); 1986 error = 0; 1987 goto out; 1988 } 1989 1990 /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */ 1991 error = inode_newsize_ok(inode, offset + len); 1992 if (error) 1993 goto out; 1994 1995 if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) { 1996 error = -EPERM; 1997 goto out; 1998 } 1999 2000 start = offset >> PAGE_CACHE_SHIFT; 2001 end = (offset + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 2002 /* Try to avoid a swapstorm if len is impossible to satisfy */ 2003 if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) { 2004 error = -ENOSPC; 2005 goto out; 2006 } 2007 2008 shmem_falloc.waitq = NULL; 2009 shmem_falloc.start = start; 2010 shmem_falloc.next = start; 2011 shmem_falloc.nr_falloced = 0; 2012 shmem_falloc.nr_unswapped = 0; 2013 spin_lock(&inode->i_lock); 2014 inode->i_private = &shmem_falloc; 2015 spin_unlock(&inode->i_lock); 2016 2017 for (index = start; index < end; index++) { 2018 struct page *page; 2019 2020 /* 2021 * Good, the fallocate(2) manpage permits EINTR: we may have 2022 * been interrupted because we are using up too much memory. 2023 */ 2024 if (signal_pending(current)) 2025 error = -EINTR; 2026 else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced) 2027 error = -ENOMEM; 2028 else 2029 error = shmem_getpage(inode, index, &page, SGP_FALLOC, 2030 NULL); 2031 if (error) { 2032 /* Remove the !PageUptodate pages we added */ 2033 shmem_undo_range(inode, 2034 (loff_t)start << PAGE_CACHE_SHIFT, 2035 (loff_t)index << PAGE_CACHE_SHIFT, true); 2036 goto undone; 2037 } 2038 2039 /* 2040 * Inform shmem_writepage() how far we have reached. 2041 * No need for lock or barrier: we have the page lock. 2042 */ 2043 shmem_falloc.next++; 2044 if (!PageUptodate(page)) 2045 shmem_falloc.nr_falloced++; 2046 2047 /* 2048 * If !PageUptodate, leave it that way so that freeable pages 2049 * can be recognized if we need to rollback on error later. 2050 * But set_page_dirty so that memory pressure will swap rather 2051 * than free the pages we are allocating (and SGP_CACHE pages 2052 * might still be clean: we now need to mark those dirty too). 
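 *
 * Worked example (illustrative, not from the original comment): if
 * this loop was asked for pages 0-15 and fails at index 12, the
 * shmem_undo_range() call above removes only the still !Uptodate
 * pages that this pass created in 0-11; pages that were already
 * Uptodate beforehand are left in place.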
2053 */ 2054 set_page_dirty(page); 2055 unlock_page(page); 2056 page_cache_release(page); 2057 cond_resched(); 2058 } 2059 2060 if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) 2061 i_size_write(inode, offset + len); 2062 inode->i_ctime = CURRENT_TIME; 2063undone: 2064 spin_lock(&inode->i_lock); 2065 inode->i_private = NULL; 2066 spin_unlock(&inode->i_lock); 2067out: 2068 mutex_unlock(&inode->i_mutex); 2069 return error; 2070} 2071 2072static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) 2073{ 2074 struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); 2075 2076 buf->f_type = TMPFS_MAGIC; 2077 buf->f_bsize = PAGE_CACHE_SIZE; 2078 buf->f_namelen = NAME_MAX; 2079 if (sbinfo->max_blocks) { 2080 buf->f_blocks = sbinfo->max_blocks; 2081 buf->f_bavail = 2082 buf->f_bfree = sbinfo->max_blocks - 2083 percpu_counter_sum(&sbinfo->used_blocks); 2084 } 2085 if (sbinfo->max_inodes) { 2086 buf->f_files = sbinfo->max_inodes; 2087 buf->f_ffree = sbinfo->free_inodes; 2088 } 2089 /* else leave those fields 0 like simple_statfs */ 2090 return 0; 2091} 2092 2093/* 2094 * File creation. Allocate an inode, and we're done.. 2095 */ 2096static int 2097shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) 2098{ 2099 struct inode *inode; 2100 int error = -ENOSPC; 2101 2102 inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE); 2103 if (inode) { 2104 error = simple_acl_create(dir, inode); 2105 if (error) 2106 goto out_iput; 2107 error = security_inode_init_security(inode, dir, 2108 &dentry->d_name, 2109 shmem_initxattrs, NULL); 2110 if (error && error != -EOPNOTSUPP) 2111 goto out_iput; 2112 2113 error = 0; 2114 dir->i_size += BOGO_DIRENT_SIZE; 2115 dir->i_ctime = dir->i_mtime = CURRENT_TIME; 2116 d_instantiate(dentry, inode); 2117 dget(dentry); /* Extra count - pin the dentry in core */ 2118 } 2119 return error; 2120out_iput: 2121 iput(inode); 2122 return error; 2123} 2124 2125static int 2126shmem_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) 2127{ 2128 struct inode *inode; 2129 int error = -ENOSPC; 2130 2131 inode = shmem_get_inode(dir->i_sb, dir, mode, 0, VM_NORESERVE); 2132 if (inode) { 2133 error = security_inode_init_security(inode, dir, 2134 NULL, 2135 shmem_initxattrs, NULL); 2136 if (error && error != -EOPNOTSUPP) 2137 goto out_iput; 2138 error = simple_acl_create(dir, inode); 2139 if (error) 2140 goto out_iput; 2141 d_tmpfile(dentry, inode); 2142 } 2143 return error; 2144out_iput: 2145 iput(inode); 2146 return error; 2147} 2148 2149static int shmem_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) 2150{ 2151 int error; 2152 2153 if ((error = shmem_mknod(dir, dentry, mode | S_IFDIR, 0))) 2154 return error; 2155 inc_nlink(dir); 2156 return 0; 2157} 2158 2159static int shmem_create(struct inode *dir, struct dentry *dentry, umode_t mode, 2160 bool excl) 2161{ 2162 return shmem_mknod(dir, dentry, mode | S_IFREG, 0); 2163} 2164 2165/* 2166 * Link a file.. 2167 */ 2168static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) 2169{ 2170 struct inode *inode = old_dentry->d_inode; 2171 int ret; 2172 2173 /* 2174 * No ordinary (disk based) filesystem counts links as inodes; 2175 * but each new link needs a new dentry, pinning lowmem, and 2176 * tmpfs dentries cannot be pruned until they are unlinked. 
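 *
 * That is why shmem_reserve_inode() is charged below for each new
 * link even though no inode is allocated: the pinned dentry costs
 * comparable lowmem, and shmem_unlink() gives the charge back once
 * the extra link is removed (see the i_nlink > 1 test there).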
2177 */ 2178 ret = shmem_reserve_inode(inode->i_sb); 2179 if (ret) 2180 goto out; 2181 2182 dir->i_size += BOGO_DIRENT_SIZE; 2183 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; 2184 inc_nlink(inode); 2185 ihold(inode); /* New dentry reference */ 2186 dget(dentry); /* Extra pinning count for the created dentry */ 2187 d_instantiate(dentry, inode); 2188out: 2189 return ret; 2190} 2191 2192static int shmem_unlink(struct inode *dir, struct dentry *dentry) 2193{ 2194 struct inode *inode = dentry->d_inode; 2195 2196 if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode)) 2197 shmem_free_inode(inode->i_sb); 2198 2199 dir->i_size -= BOGO_DIRENT_SIZE; 2200 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; 2201 drop_nlink(inode); 2202 dput(dentry); /* Undo the count from "create" - this does all the work */ 2203 return 0; 2204} 2205 2206static int shmem_rmdir(struct inode *dir, struct dentry *dentry) 2207{ 2208 if (!simple_empty(dentry)) 2209 return -ENOTEMPTY; 2210 2211 drop_nlink(dentry->d_inode); 2212 drop_nlink(dir); 2213 return shmem_unlink(dir, dentry); 2214} 2215 2216/* 2217 * The VFS layer already does all the dentry stuff for rename, 2218 * we just have to decrement the usage count for the target if 2219 * it exists so that the VFS layer correctly free's it when it 2220 * gets overwritten. 2221 */ 2222static int shmem_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) 2223{ 2224 struct inode *inode = old_dentry->d_inode; 2225 int they_are_dirs = S_ISDIR(inode->i_mode); 2226 2227 if (!simple_empty(new_dentry)) 2228 return -ENOTEMPTY; 2229 2230 if (new_dentry->d_inode) { 2231 (void) shmem_unlink(new_dir, new_dentry); 2232 if (they_are_dirs) 2233 drop_nlink(old_dir); 2234 } else if (they_are_dirs) { 2235 drop_nlink(old_dir); 2236 inc_nlink(new_dir); 2237 } 2238 2239 old_dir->i_size -= BOGO_DIRENT_SIZE; 2240 new_dir->i_size += BOGO_DIRENT_SIZE; 2241 old_dir->i_ctime = old_dir->i_mtime = 2242 new_dir->i_ctime = new_dir->i_mtime = 2243 inode->i_ctime = CURRENT_TIME; 2244 return 0; 2245} 2246 2247static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *symname) 2248{ 2249 int error; 2250 int len; 2251 struct inode *inode; 2252 struct page *page; 2253 char *kaddr; 2254 struct shmem_inode_info *info; 2255 2256 len = strlen(symname) + 1; 2257 if (len > PAGE_CACHE_SIZE) 2258 return -ENAMETOOLONG; 2259 2260 inode = shmem_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0, VM_NORESERVE); 2261 if (!inode) 2262 return -ENOSPC; 2263 2264 error = security_inode_init_security(inode, dir, &dentry->d_name, 2265 shmem_initxattrs, NULL); 2266 if (error) { 2267 if (error != -EOPNOTSUPP) { 2268 iput(inode); 2269 return error; 2270 } 2271 error = 0; 2272 } 2273 2274 info = SHMEM_I(inode); 2275 inode->i_size = len-1; 2276 if (len <= SHORT_SYMLINK_LEN) { 2277 info->symlink = kmemdup(symname, len, GFP_KERNEL); 2278 if (!info->symlink) { 2279 iput(inode); 2280 return -ENOMEM; 2281 } 2282 inode->i_op = &shmem_short_symlink_operations; 2283 } else { 2284 error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL); 2285 if (error) { 2286 iput(inode); 2287 return error; 2288 } 2289 inode->i_mapping->a_ops = &shmem_aops; 2290 inode->i_op = &shmem_symlink_inode_operations; 2291 kaddr = kmap_atomic(page); 2292 memcpy(kaddr, symname, len); 2293 kunmap_atomic(kaddr); 2294 SetPageUptodate(page); 2295 set_page_dirty(page); 2296 unlock_page(page); 2297 page_cache_release(page); 2298 } 2299 dir->i_size += BOGO_DIRENT_SIZE; 2300 dir->i_ctime = 
dir->i_mtime = CURRENT_TIME; 2301 d_instantiate(dentry, inode); 2302 dget(dentry); 2303 return 0; 2304} 2305 2306static void *shmem_follow_short_symlink(struct dentry *dentry, struct nameidata *nd) 2307{ 2308 nd_set_link(nd, SHMEM_I(dentry->d_inode)->symlink); 2309 return NULL; 2310} 2311 2312static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd) 2313{ 2314 struct page *page = NULL; 2315 int error = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL); 2316 nd_set_link(nd, error ? ERR_PTR(error) : kmap(page)); 2317 if (page) 2318 unlock_page(page); 2319 return page; 2320} 2321 2322static void shmem_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) 2323{ 2324 if (!IS_ERR(nd_get_link(nd))) { 2325 struct page *page = cookie; 2326 kunmap(page); 2327 mark_page_accessed(page); 2328 page_cache_release(page); 2329 } 2330} 2331 2332#ifdef CONFIG_TMPFS_XATTR 2333/* 2334 * Superblocks without xattr inode operations may get some security.* xattr 2335 * support from the LSM "for free". As soon as we have any other xattrs 2336 * like ACLs, we also need to implement the security.* handlers at 2337 * filesystem level, though. 2338 */ 2339 2340/* 2341 * Callback for security_inode_init_security() for acquiring xattrs. 2342 */ 2343static int shmem_initxattrs(struct inode *inode, 2344 const struct xattr *xattr_array, 2345 void *fs_info) 2346{ 2347 struct shmem_inode_info *info = SHMEM_I(inode); 2348 const struct xattr *xattr; 2349 struct simple_xattr *new_xattr; 2350 size_t len; 2351 2352 for (xattr = xattr_array; xattr->name != NULL; xattr++) { 2353 new_xattr = simple_xattr_alloc(xattr->value, xattr->value_len); 2354 if (!new_xattr) 2355 return -ENOMEM; 2356 2357 len = strlen(xattr->name) + 1; 2358 new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len, 2359 GFP_KERNEL); 2360 if (!new_xattr->name) { 2361 kfree(new_xattr); 2362 return -ENOMEM; 2363 } 2364 2365 memcpy(new_xattr->name, XATTR_SECURITY_PREFIX, 2366 XATTR_SECURITY_PREFIX_LEN); 2367 memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN, 2368 xattr->name, len); 2369 2370 simple_xattr_list_add(&info->xattrs, new_xattr); 2371 } 2372 2373 return 0; 2374} 2375 2376static const struct xattr_handler *shmem_xattr_handlers[] = { 2377#ifdef CONFIG_TMPFS_POSIX_ACL 2378 &posix_acl_access_xattr_handler, 2379 &posix_acl_default_xattr_handler, 2380#endif 2381 NULL 2382}; 2383 2384static int shmem_xattr_validate(const char *name) 2385{ 2386 struct { const char *prefix; size_t len; } arr[] = { 2387 { XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN }, 2388 { XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN } 2389 }; 2390 int i; 2391 2392 for (i = 0; i < ARRAY_SIZE(arr); i++) { 2393 size_t preflen = arr[i].len; 2394 if (strncmp(name, arr[i].prefix, preflen) == 0) { 2395 if (!name[preflen]) 2396 return -EINVAL; 2397 return 0; 2398 } 2399 } 2400 return -EOPNOTSUPP; 2401} 2402 2403static ssize_t shmem_getxattr(struct dentry *dentry, const char *name, 2404 void *buffer, size_t size) 2405{ 2406 struct shmem_inode_info *info = SHMEM_I(dentry->d_inode); 2407 int err; 2408 2409 /* 2410 * If this is a request for a synthetic attribute in the system.* 2411 * namespace use the generic infrastructure to resolve a handler 2412 * for it via sb->s_xattr. 
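 *
 * For example (illustrative): "system.posix_acl_access" is routed
 * through the handlers registered in sb->s_xattr, while names like
 * "security.selinux" or "trusted.foo" fall through to the
 * simple_xattr list; a bare prefix such as "security." fails
 * shmem_xattr_validate() with -EINVAL, and any other namespace gets
 * -EOPNOTSUPP.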
2413 */ 2414 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) 2415 return generic_getxattr(dentry, name, buffer, size); 2416 2417 err = shmem_xattr_validate(name); 2418 if (err) 2419 return err; 2420 2421 return simple_xattr_get(&info->xattrs, name, buffer, size); 2422} 2423 2424static int shmem_setxattr(struct dentry *dentry, const char *name, 2425 const void *value, size_t size, int flags) 2426{ 2427 struct shmem_inode_info *info = SHMEM_I(dentry->d_inode); 2428 int err; 2429 2430 /* 2431 * If this is a request for a synthetic attribute in the system.* 2432 * namespace use the generic infrastructure to resolve a handler 2433 * for it via sb->s_xattr. 2434 */ 2435 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) 2436 return generic_setxattr(dentry, name, value, size, flags); 2437 2438 err = shmem_xattr_validate(name); 2439 if (err) 2440 return err; 2441 2442 return simple_xattr_set(&info->xattrs, name, value, size, flags); 2443} 2444 2445static int shmem_removexattr(struct dentry *dentry, const char *name) 2446{ 2447 struct shmem_inode_info *info = SHMEM_I(dentry->d_inode); 2448 int err; 2449 2450 /* 2451 * If this is a request for a synthetic attribute in the system.* 2452 * namespace use the generic infrastructure to resolve a handler 2453 * for it via sb->s_xattr. 2454 */ 2455 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) 2456 return generic_removexattr(dentry, name); 2457 2458 err = shmem_xattr_validate(name); 2459 if (err) 2460 return err; 2461 2462 return simple_xattr_remove(&info->xattrs, name); 2463} 2464 2465static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size) 2466{ 2467 struct shmem_inode_info *info = SHMEM_I(dentry->d_inode); 2468 return simple_xattr_list(&info->xattrs, buffer, size); 2469} 2470#endif /* CONFIG_TMPFS_XATTR */ 2471 2472static const struct inode_operations shmem_short_symlink_operations = { 2473 .readlink = generic_readlink, 2474 .follow_link = shmem_follow_short_symlink, 2475#ifdef CONFIG_TMPFS_XATTR 2476 .setxattr = shmem_setxattr, 2477 .getxattr = shmem_getxattr, 2478 .listxattr = shmem_listxattr, 2479 .removexattr = shmem_removexattr, 2480#endif 2481}; 2482 2483static const struct inode_operations shmem_symlink_inode_operations = { 2484 .readlink = generic_readlink, 2485 .follow_link = shmem_follow_link, 2486 .put_link = shmem_put_link, 2487#ifdef CONFIG_TMPFS_XATTR 2488 .setxattr = shmem_setxattr, 2489 .getxattr = shmem_getxattr, 2490 .listxattr = shmem_listxattr, 2491 .removexattr = shmem_removexattr, 2492#endif 2493}; 2494 2495static struct dentry *shmem_get_parent(struct dentry *child) 2496{ 2497 return ERR_PTR(-ESTALE); 2498} 2499 2500static int shmem_match(struct inode *ino, void *vfh) 2501{ 2502 __u32 *fh = vfh; 2503 __u64 inum = fh[2]; 2504 inum = (inum << 32) | fh[1]; 2505 return ino->i_ino == inum && fh[0] == ino->i_generation; 2506} 2507 2508static struct dentry *shmem_fh_to_dentry(struct super_block *sb, 2509 struct fid *fid, int fh_len, int fh_type) 2510{ 2511 struct inode *inode; 2512 struct dentry *dentry = NULL; 2513 u64 inum; 2514 2515 if (fh_len < 3) 2516 return NULL; 2517 2518 inum = fid->raw[2]; 2519 inum = (inum << 32) | fid->raw[1]; 2520 2521 inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]), 2522 shmem_match, fid->raw); 2523 if (inode) { 2524 dentry = d_find_alias(inode); 2525 iput(inode); 2526 } 2527 2528 return dentry; 2529} 2530 2531static int shmem_encode_fh(struct inode *inode, __u32 *fh, int *len, 2532 struct inode *parent) 2533{ 2534 if (*len 
< 3) { 2535 *len = 3; 2536 return FILEID_INVALID; 2537 } 2538 2539 if (inode_unhashed(inode)) { 2540 /* Unfortunately insert_inode_hash is not idempotent, 2541 * so as we hash inodes here rather than at creation 2542 * time, we need a lock to ensure we only try 2543 * to do it once 2544 */ 2545 static DEFINE_SPINLOCK(lock); 2546 spin_lock(&lock); 2547 if (inode_unhashed(inode)) 2548 __insert_inode_hash(inode, 2549 inode->i_ino + inode->i_generation); 2550 spin_unlock(&lock); 2551 } 2552 2553 fh[0] = inode->i_generation; 2554 fh[1] = inode->i_ino; 2555 fh[2] = ((__u64)inode->i_ino) >> 32; 2556 2557 *len = 3; 2558 return 1; 2559} 2560 2561static const struct export_operations shmem_export_ops = { 2562 .get_parent = shmem_get_parent, 2563 .encode_fh = shmem_encode_fh, 2564 .fh_to_dentry = shmem_fh_to_dentry, 2565}; 2566 2567static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo, 2568 bool remount) 2569{ 2570 char *this_char, *value, *rest; 2571 struct mempolicy *mpol = NULL; 2572 uid_t uid; 2573 gid_t gid; 2574 2575 while (options != NULL) { 2576 this_char = options; 2577 for (;;) { 2578 /* 2579 * NUL-terminate this option: unfortunately, 2580 * mount options form a comma-separated list, 2581 * but mpol's nodelist may also contain commas. 2582 */ 2583 options = strchr(options, ','); 2584 if (options == NULL) 2585 break; 2586 options++; 2587 if (!isdigit(*options)) { 2588 options[-1] = '\0'; 2589 break; 2590 } 2591 } 2592 if (!*this_char) 2593 continue; 2594 if ((value = strchr(this_char,'=')) != NULL) { 2595 *value++ = 0; 2596 } else { 2597 printk(KERN_ERR 2598 "tmpfs: No value for mount option '%s'\n", 2599 this_char); 2600 goto error; 2601 } 2602 2603 if (!strcmp(this_char,"size")) { 2604 unsigned long long size; 2605 size = memparse(value,&rest); 2606 if (*rest == '%') { 2607 size <<= PAGE_SHIFT; 2608 size *= totalram_pages; 2609 do_div(size, 100); 2610 rest++; 2611 } 2612 if (*rest) 2613 goto bad_val; 2614 sbinfo->max_blocks = 2615 DIV_ROUND_UP(size, PAGE_CACHE_SIZE); 2616 } else if (!strcmp(this_char,"nr_blocks")) { 2617 sbinfo->max_blocks = memparse(value, &rest); 2618 if (*rest) 2619 goto bad_val; 2620 } else if (!strcmp(this_char,"nr_inodes")) { 2621 sbinfo->max_inodes = memparse(value, &rest); 2622 if (*rest) 2623 goto bad_val; 2624 } else if (!strcmp(this_char,"mode")) { 2625 if (remount) 2626 continue; 2627 sbinfo->mode = simple_strtoul(value, &rest, 8) & 07777; 2628 if (*rest) 2629 goto bad_val; 2630 } else if (!strcmp(this_char,"uid")) { 2631 if (remount) 2632 continue; 2633 uid = simple_strtoul(value, &rest, 0); 2634 if (*rest) 2635 goto bad_val; 2636 sbinfo->uid = make_kuid(current_user_ns(), uid); 2637 if (!uid_valid(sbinfo->uid)) 2638 goto bad_val; 2639 } else if (!strcmp(this_char,"gid")) { 2640 if (remount) 2641 continue; 2642 gid = simple_strtoul(value, &rest, 0); 2643 if (*rest) 2644 goto bad_val; 2645 sbinfo->gid = make_kgid(current_user_ns(), gid); 2646 if (!gid_valid(sbinfo->gid)) 2647 goto bad_val; 2648 } else if (!strcmp(this_char,"mpol")) { 2649 mpol_put(mpol); 2650 mpol = NULL; 2651 if (mpol_parse_str(value, &mpol)) 2652 goto bad_val; 2653 } else { 2654 printk(KERN_ERR "tmpfs: Bad mount option %s\n", 2655 this_char); 2656 goto error; 2657 } 2658 } 2659 sbinfo->mpol = mpol; 2660 return 0; 2661 2662bad_val: 2663 printk(KERN_ERR "tmpfs: Bad value '%s' for mount option '%s'\n", 2664 value, this_char); 2665error: 2666 mpol_put(mpol); 2667 return 1; 2668 2669} 2670 2671static int shmem_remount_fs(struct super_block *sb, int *flags, char *data) 2672{ 
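	/*
	 * Illustrative flow (a sketch, not part of the original source):
	 * a remount such as
	 *
	 *	mount -o remount,size=512m,nr_inodes=8192 /tmp
	 *
	 * arrives here with data = "size=512m,nr_inodes=8192".  The
	 * options are parsed into a scratch copy of sbinfo first, so a
	 * bad value leaves the live superblock untouched; the checks
	 * below then refuse both shrinking below current usage and
	 * imposing a limit on a previously unlimited instance.
	 */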
2673 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 2674 struct shmem_sb_info config = *sbinfo; 2675 unsigned long inodes; 2676 int error = -EINVAL; 2677 2678 config.mpol = NULL; 2679 if (shmem_parse_options(data, &config, true)) 2680 return error; 2681 2682 spin_lock(&sbinfo->stat_lock); 2683 inodes = sbinfo->max_inodes - sbinfo->free_inodes; 2684 if (percpu_counter_compare(&sbinfo->used_blocks, config.max_blocks) > 0) 2685 goto out; 2686 if (config.max_inodes < inodes) 2687 goto out; 2688 /* 2689 * Those tests disallow limited->unlimited while any are in use; 2690 * but we must separately disallow unlimited->limited, because 2691 * in that case we have no record of how much is already in use. 2692 */ 2693 if (config.max_blocks && !sbinfo->max_blocks) 2694 goto out; 2695 if (config.max_inodes && !sbinfo->max_inodes) 2696 goto out; 2697 2698 error = 0; 2699 sbinfo->max_blocks = config.max_blocks; 2700 sbinfo->max_inodes = config.max_inodes; 2701 sbinfo->free_inodes = config.max_inodes - inodes; 2702 2703 /* 2704 * Preserve previous mempolicy unless mpol remount option was specified. 2705 */ 2706 if (config.mpol) { 2707 mpol_put(sbinfo->mpol); 2708 sbinfo->mpol = config.mpol; /* transfers initial ref */ 2709 } 2710out: 2711 spin_unlock(&sbinfo->stat_lock); 2712 return error; 2713} 2714 2715static int shmem_show_options(struct seq_file *seq, struct dentry *root) 2716{ 2717 struct shmem_sb_info *sbinfo = SHMEM_SB(root->d_sb); 2718 2719 if (sbinfo->max_blocks != shmem_default_max_blocks()) 2720 seq_printf(seq, ",size=%luk", 2721 sbinfo->max_blocks << (PAGE_CACHE_SHIFT - 10)); 2722 if (sbinfo->max_inodes != shmem_default_max_inodes()) 2723 seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes); 2724 if (sbinfo->mode != (S_IRWXUGO | S_ISVTX)) 2725 seq_printf(seq, ",mode=%03ho", sbinfo->mode); 2726 if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID)) 2727 seq_printf(seq, ",uid=%u", 2728 from_kuid_munged(&init_user_ns, sbinfo->uid)); 2729 if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID)) 2730 seq_printf(seq, ",gid=%u", 2731 from_kgid_munged(&init_user_ns, sbinfo->gid)); 2732 shmem_show_mpol(seq, sbinfo->mpol); 2733 return 0; 2734} 2735#endif /* CONFIG_TMPFS */ 2736 2737static void shmem_put_super(struct super_block *sb) 2738{ 2739 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 2740 2741 percpu_counter_destroy(&sbinfo->used_blocks); 2742 mpol_put(sbinfo->mpol); 2743 kfree(sbinfo); 2744 sb->s_fs_info = NULL; 2745} 2746 2747int shmem_fill_super(struct super_block *sb, void *data, int silent) 2748{ 2749 struct inode *inode; 2750 struct shmem_sb_info *sbinfo; 2751 int err = -ENOMEM; 2752 2753 /* Round up to L1_CACHE_BYTES to resist false sharing */ 2754 sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info), 2755 L1_CACHE_BYTES), GFP_KERNEL); 2756 if (!sbinfo) 2757 return -ENOMEM; 2758 2759 sbinfo->mode = S_IRWXUGO | S_ISVTX; 2760 sbinfo->uid = current_fsuid(); 2761 sbinfo->gid = current_fsgid(); 2762 sb->s_fs_info = sbinfo; 2763 2764#ifdef CONFIG_TMPFS 2765 /* 2766 * Per default we only allow half of the physical ram per 2767 * tmpfs instance, limiting inodes to one per page of lowmem; 2768 * but the internal instance is left unlimited. 
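 *
 * Worked example (assuming 4KiB pages and 8GiB of RAM, all of it
 * lowmem): totalram_pages = 2M pages, so the defaults below come to
 * max_blocks = 1M blocks (4GiB of tmpfs space) and
 * max_inodes = min(2M, 1M) = 1M inodes.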
2769 */ 2770 if (!(sb->s_flags & MS_KERNMOUNT)) { 2771 sbinfo->max_blocks = shmem_default_max_blocks(); 2772 sbinfo->max_inodes = shmem_default_max_inodes(); 2773 if (shmem_parse_options(data, sbinfo, false)) { 2774 err = -EINVAL; 2775 goto failed; 2776 } 2777 } else { 2778 sb->s_flags |= MS_NOUSER; 2779 } 2780 sb->s_export_op = &shmem_export_ops; 2781 sb->s_flags |= MS_NOSEC; 2782#else 2783 sb->s_flags |= MS_NOUSER; 2784#endif 2785 2786 spin_lock_init(&sbinfo->stat_lock); 2787 if (percpu_counter_init(&sbinfo->used_blocks, 0)) 2788 goto failed; 2789 sbinfo->free_inodes = sbinfo->max_inodes; 2790 2791 sb->s_maxbytes = MAX_LFS_FILESIZE; 2792 sb->s_blocksize = PAGE_CACHE_SIZE; 2793 sb->s_blocksize_bits = PAGE_CACHE_SHIFT; 2794 sb->s_magic = TMPFS_MAGIC; 2795 sb->s_op = &shmem_ops; 2796 sb->s_time_gran = 1; 2797#ifdef CONFIG_TMPFS_XATTR 2798 sb->s_xattr = shmem_xattr_handlers; 2799#endif 2800#ifdef CONFIG_TMPFS_POSIX_ACL 2801 sb->s_flags |= MS_POSIXACL; 2802#endif 2803 2804 inode = shmem_get_inode(sb, NULL, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE); 2805 if (!inode) 2806 goto failed; 2807 inode->i_uid = sbinfo->uid; 2808 inode->i_gid = sbinfo->gid; 2809 sb->s_root = d_make_root(inode); 2810 if (!sb->s_root) 2811 goto failed; 2812 return 0; 2813 2814failed: 2815 shmem_put_super(sb); 2816 return err; 2817} 2818 2819static struct kmem_cache *shmem_inode_cachep; 2820 2821static struct inode *shmem_alloc_inode(struct super_block *sb) 2822{ 2823 struct shmem_inode_info *info; 2824 info = kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL); 2825 if (!info) 2826 return NULL; 2827 return &info->vfs_inode; 2828} 2829 2830static void shmem_destroy_callback(struct rcu_head *head) 2831{ 2832 struct inode *inode = container_of(head, struct inode, i_rcu); 2833 kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); 2834} 2835 2836static void shmem_destroy_inode(struct inode *inode) 2837{ 2838 if (S_ISREG(inode->i_mode)) 2839 mpol_free_shared_policy(&SHMEM_I(inode)->policy); 2840 call_rcu(&inode->i_rcu, shmem_destroy_callback); 2841} 2842 2843static void shmem_init_inode(void *foo) 2844{ 2845 struct shmem_inode_info *info = foo; 2846 inode_init_once(&info->vfs_inode); 2847} 2848 2849static int shmem_init_inodecache(void) 2850{ 2851 shmem_inode_cachep = kmem_cache_create("shmem_inode_cache", 2852 sizeof(struct shmem_inode_info), 2853 0, SLAB_PANIC, shmem_init_inode); 2854 return 0; 2855} 2856 2857static void shmem_destroy_inodecache(void) 2858{ 2859 kmem_cache_destroy(shmem_inode_cachep); 2860} 2861 2862static const struct address_space_operations shmem_aops = { 2863 .writepage = shmem_writepage, 2864 .set_page_dirty = __set_page_dirty_no_writeback, 2865#ifdef CONFIG_TMPFS 2866 .write_begin = shmem_write_begin, 2867 .write_end = shmem_write_end, 2868#endif 2869 .migratepage = migrate_page, 2870 .error_remove_page = generic_error_remove_page, 2871}; 2872 2873static const struct file_operations shmem_file_operations = { 2874 .mmap = shmem_mmap, 2875#ifdef CONFIG_TMPFS 2876 .llseek = shmem_file_llseek, 2877 .read = new_sync_read, 2878 .write = new_sync_write, 2879 .read_iter = shmem_file_read_iter, 2880 .write_iter = generic_file_write_iter, 2881 .fsync = noop_fsync, 2882 .splice_read = shmem_file_splice_read, 2883 .splice_write = iter_file_splice_write, 2884 .fallocate = shmem_fallocate, 2885#endif 2886}; 2887 2888static const struct inode_operations shmem_inode_operations = { 2889 .setattr = shmem_setattr, 2890#ifdef CONFIG_TMPFS_XATTR 2891 .setxattr = shmem_setxattr, 2892 .getxattr = shmem_getxattr, 2893 .listxattr = 
shmem_listxattr, 2894 .removexattr = shmem_removexattr, 2895 .set_acl = simple_set_acl, 2896#endif 2897}; 2898 2899static const struct inode_operations shmem_dir_inode_operations = { 2900#ifdef CONFIG_TMPFS 2901 .create = shmem_create, 2902 .lookup = simple_lookup, 2903 .link = shmem_link, 2904 .unlink = shmem_unlink, 2905 .symlink = shmem_symlink, 2906 .mkdir = shmem_mkdir, 2907 .rmdir = shmem_rmdir, 2908 .mknod = shmem_mknod, 2909 .rename = shmem_rename, 2910 .tmpfile = shmem_tmpfile, 2911#endif 2912#ifdef CONFIG_TMPFS_XATTR 2913 .setxattr = shmem_setxattr, 2914 .getxattr = shmem_getxattr, 2915 .listxattr = shmem_listxattr, 2916 .removexattr = shmem_removexattr, 2917#endif 2918#ifdef CONFIG_TMPFS_POSIX_ACL 2919 .setattr = shmem_setattr, 2920 .set_acl = simple_set_acl, 2921#endif 2922}; 2923 2924static const struct inode_operations shmem_special_inode_operations = { 2925#ifdef CONFIG_TMPFS_XATTR 2926 .setxattr = shmem_setxattr, 2927 .getxattr = shmem_getxattr, 2928 .listxattr = shmem_listxattr, 2929 .removexattr = shmem_removexattr, 2930#endif 2931#ifdef CONFIG_TMPFS_POSIX_ACL 2932 .setattr = shmem_setattr, 2933 .set_acl = simple_set_acl, 2934#endif 2935}; 2936 2937static const struct super_operations shmem_ops = { 2938 .alloc_inode = shmem_alloc_inode, 2939 .destroy_inode = shmem_destroy_inode, 2940#ifdef CONFIG_TMPFS 2941 .statfs = shmem_statfs, 2942 .remount_fs = shmem_remount_fs, 2943 .show_options = shmem_show_options, 2944#endif 2945 .evict_inode = shmem_evict_inode, 2946 .drop_inode = generic_delete_inode, 2947 .put_super = shmem_put_super, 2948}; 2949 2950static const struct vm_operations_struct shmem_vm_ops = { 2951 .fault = shmem_fault, 2952 .map_pages = filemap_map_pages, 2953#ifdef CONFIG_NUMA 2954 .set_policy = shmem_set_policy, 2955 .get_policy = shmem_get_policy, 2956#endif 2957 .remap_pages = generic_file_remap_pages, 2958}; 2959 2960static struct dentry *shmem_mount(struct file_system_type *fs_type, 2961 int flags, const char *dev_name, void *data) 2962{ 2963 return mount_nodev(fs_type, flags, data, shmem_fill_super); 2964} 2965 2966static struct file_system_type shmem_fs_type = { 2967 .owner = THIS_MODULE, 2968 .name = "tmpfs", 2969 .mount = shmem_mount, 2970 .kill_sb = kill_litter_super, 2971 .fs_flags = FS_USERNS_MOUNT, 2972}; 2973 2974int __init shmem_init(void) 2975{ 2976 int error; 2977 2978 /* If rootfs called this, don't re-init */ 2979 if (shmem_inode_cachep) 2980 return 0; 2981 2982 error = bdi_init(&shmem_backing_dev_info); 2983 if (error) 2984 goto out4; 2985 2986 error = shmem_init_inodecache(); 2987 if (error) 2988 goto out3; 2989 2990 error = register_filesystem(&shmem_fs_type); 2991 if (error) { 2992 printk(KERN_ERR "Could not register tmpfs\n"); 2993 goto out2; 2994 } 2995 2996 shm_mnt = kern_mount(&shmem_fs_type); 2997 if (IS_ERR(shm_mnt)) { 2998 error = PTR_ERR(shm_mnt); 2999 printk(KERN_ERR "Could not kern_mount tmpfs\n"); 3000 goto out1; 3001 } 3002 return 0; 3003 3004out1: 3005 unregister_filesystem(&shmem_fs_type); 3006out2: 3007 shmem_destroy_inodecache(); 3008out3: 3009 bdi_destroy(&shmem_backing_dev_info); 3010out4: 3011 shm_mnt = ERR_PTR(error); 3012 return error; 3013} 3014 3015#else /* !CONFIG_SHMEM */ 3016 3017/* 3018 * tiny-shmem: simple shmemfs and tmpfs using ramfs code 3019 * 3020 * This is intended for small system where the benefits of the full 3021 * shmem code (swap-backed and resource-limited) are outweighed by 3022 * their complexity. On systems without swap this code should be 3023 * effectively equivalent, but much lighter weight. 
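 *
 * Concretely, the stubs that follow turn shmem_unuse() and
 * shmem_lock() into no-ops, reduce shmem_truncate_range() to a
 * plain truncate_inode_pages_range(), and alias the vm, file and
 * inode helpers to their generic/ramfs counterparts.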
3024 */ 3025 3026static struct file_system_type shmem_fs_type = { 3027 .name = "tmpfs", 3028 .mount = ramfs_mount, 3029 .kill_sb = kill_litter_super, 3030 .fs_flags = FS_USERNS_MOUNT, 3031}; 3032 3033int __init shmem_init(void) 3034{ 3035 BUG_ON(register_filesystem(&shmem_fs_type) != 0); 3036 3037 shm_mnt = kern_mount(&shmem_fs_type); 3038 BUG_ON(IS_ERR(shm_mnt)); 3039 3040 return 0; 3041} 3042 3043int shmem_unuse(swp_entry_t swap, struct page *page) 3044{ 3045 return 0; 3046} 3047 3048int shmem_lock(struct file *file, int lock, struct user_struct *user) 3049{ 3050 return 0; 3051} 3052 3053void shmem_unlock_mapping(struct address_space *mapping) 3054{ 3055} 3056 3057void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) 3058{ 3059 truncate_inode_pages_range(inode->i_mapping, lstart, lend); 3060} 3061EXPORT_SYMBOL_GPL(shmem_truncate_range); 3062 3063#define shmem_vm_ops generic_file_vm_ops 3064#define shmem_file_operations ramfs_file_operations 3065#define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev) 3066#define shmem_acct_size(flags, size) 0 3067#define shmem_unacct_size(flags, size) do {} while (0) 3068 3069#endif /* CONFIG_SHMEM */ 3070 3071/* common code */ 3072 3073static struct dentry_operations anon_ops = { 3074 .d_dname = simple_dname 3075}; 3076 3077static struct file *__shmem_file_setup(const char *name, loff_t size, 3078 unsigned long flags, unsigned int i_flags) 3079{ 3080 struct file *res; 3081 struct inode *inode; 3082 struct path path; 3083 struct super_block *sb; 3084 struct qstr this; 3085 3086 if (IS_ERR(shm_mnt)) 3087 return ERR_CAST(shm_mnt); 3088 3089 if (size < 0 || size > MAX_LFS_FILESIZE) 3090 return ERR_PTR(-EINVAL); 3091 3092 if (shmem_acct_size(flags, size)) 3093 return ERR_PTR(-ENOMEM); 3094 3095 res = ERR_PTR(-ENOMEM); 3096 this.name = name; 3097 this.len = strlen(name); 3098 this.hash = 0; /* will go */ 3099 sb = shm_mnt->mnt_sb; 3100 path.mnt = mntget(shm_mnt); 3101 path.dentry = d_alloc_pseudo(sb, &this); 3102 if (!path.dentry) 3103 goto put_memory; 3104 d_set_d_op(path.dentry, &anon_ops); 3105 3106 res = ERR_PTR(-ENOSPC); 3107 inode = shmem_get_inode(sb, NULL, S_IFREG | S_IRWXUGO, 0, flags); 3108 if (!inode) 3109 goto put_memory; 3110 3111 inode->i_flags |= i_flags; 3112 d_instantiate(path.dentry, inode); 3113 inode->i_size = size; 3114 clear_nlink(inode); /* It is unlinked */ 3115 res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size)); 3116 if (IS_ERR(res)) 3117 goto put_path; 3118 3119 res = alloc_file(&path, FMODE_WRITE | FMODE_READ, 3120 &shmem_file_operations); 3121 if (IS_ERR(res)) 3122 goto put_path; 3123 3124 return res; 3125 3126put_memory: 3127 shmem_unacct_size(flags, size); 3128put_path: 3129 path_put(&path); 3130 return res; 3131} 3132 3133/** 3134 * shmem_kernel_file_setup - get an unlinked file living in tmpfs which must be 3135 * kernel internal. There will be NO LSM permission checks against the 3136 * underlying inode. So users of this interface must do LSM checks at a 3137 * higher layer. The one user is the big_key implementation. LSM checks 3138 * are provided at the key level rather than the inode level. 
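 * A sketch of that use, from the caller's side (illustrative;
 * "datalen" is the caller's payload size, not a name from this
 * file):
 *
 *	file = shmem_kernel_file_setup("", datalen, 0);
 *
 * The S_PRIVATE flag this helper sets in i_flags is what makes the
 * security modules skip their inode checks.
 *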
3139 * @name: name for dentry (to be seen in /proc/<pid>/maps 3140 * @size: size to be set for the file 3141 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size 3142 */ 3143struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags) 3144{ 3145 return __shmem_file_setup(name, size, flags, S_PRIVATE); 3146} 3147 3148/** 3149 * shmem_file_setup - get an unlinked file living in tmpfs 3150 * @name: name for dentry (to be seen in /proc/<pid>/maps 3151 * @size: size to be set for the file 3152 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size 3153 */ 3154struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags) 3155{ 3156 return __shmem_file_setup(name, size, flags, 0); 3157} 3158EXPORT_SYMBOL_GPL(shmem_file_setup); 3159 3160/** 3161 * shmem_zero_setup - setup a shared anonymous mapping 3162 * @vma: the vma to be mmapped is prepared by do_mmap_pgoff 3163 */ 3164int shmem_zero_setup(struct vm_area_struct *vma) 3165{ 3166 struct file *file; 3167 loff_t size = vma->vm_end - vma->vm_start; 3168 3169 file = shmem_file_setup("dev/zero", size, vma->vm_flags); 3170 if (IS_ERR(file)) 3171 return PTR_ERR(file); 3172 3173 if (vma->vm_file) 3174 fput(vma->vm_file); 3175 vma->vm_file = file; 3176 vma->vm_ops = &shmem_vm_ops; 3177 return 0; 3178} 3179 3180/** 3181 * shmem_read_mapping_page_gfp - read into page cache, using specified page allocation flags. 3182 * @mapping: the page's address_space 3183 * @index: the page index 3184 * @gfp: the page allocator flags to use if allocating 3185 * 3186 * This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)", 3187 * with any new page allocations done using the specified allocation flags. 3188 * But read_cache_page_gfp() uses the ->readpage() method: which does not 3189 * suit tmpfs, since it may have pages in swapcache, and needs to find those 3190 * for itself; although drivers/gpu/drm i915 and ttm rely upon this support. 3191 * 3192 * i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in 3193 * with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily. 3194 */ 3195struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, 3196 pgoff_t index, gfp_t gfp) 3197{ 3198#ifdef CONFIG_SHMEM 3199 struct inode *inode = mapping->host; 3200 struct page *page; 3201 int error; 3202 3203 BUG_ON(mapping->a_ops != &shmem_aops); 3204 error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE, gfp, NULL); 3205 if (error) 3206 page = ERR_PTR(error); 3207 else 3208 unlock_page(page); 3209 return page; 3210#else 3211 /* 3212 * The tiny !SHMEM case uses ramfs without swap 3213 */ 3214 return read_cache_page_gfp(mapping, index, gfp); 3215#endif 3216} 3217EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp); 3218
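
/*
 * Usage sketch (illustrative only, not part of this file): a driver
 * along the lines of the i915 note above might do
 *
 *	struct file *filp = shmem_file_setup("gfx-obj", size, VM_NORESERVE);
 *	struct page *page;
 *
 *	if (IS_ERR(filp))
 *		return PTR_ERR(filp);
 *	page = shmem_read_mapping_page_gfp(filp->f_mapping, 0,
 *				mapping_gfp_mask(filp->f_mapping) |
 *				__GFP_NORETRY | __GFP_NOWARN);
 *	if (IS_ERR(page)) {
 *		fput(filp);
 *		return PTR_ERR(page);
 *	}
 *	... use the page ...
 *	page_cache_release(page);
 *	fput(filp);
 *
 * "gfx-obj" and "size" are placeholders; the gfp mixing mirrors the
 * i915 behaviour described in the comment above.
 */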