shmem.c revision 9183df25fe7b194563db3fec6dc3202a5855839c
1/* 2 * Resizable virtual memory filesystem for Linux. 3 * 4 * Copyright (C) 2000 Linus Torvalds. 5 * 2000 Transmeta Corp. 6 * 2000-2001 Christoph Rohland 7 * 2000-2001 SAP AG 8 * 2002 Red Hat Inc. 9 * Copyright (C) 2002-2011 Hugh Dickins. 10 * Copyright (C) 2011 Google Inc. 11 * Copyright (C) 2002-2005 VERITAS Software Corporation. 12 * Copyright (C) 2004 Andi Kleen, SuSE Labs 13 * 14 * Extended attribute support for tmpfs: 15 * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net> 16 * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com> 17 * 18 * tiny-shmem: 19 * Copyright (c) 2004, 2008 Matt Mackall <mpm@selenic.com> 20 * 21 * This file is released under the GPL. 22 */ 23 24#include <linux/fs.h> 25#include <linux/init.h> 26#include <linux/vfs.h> 27#include <linux/mount.h> 28#include <linux/ramfs.h> 29#include <linux/pagemap.h> 30#include <linux/file.h> 31#include <linux/mm.h> 32#include <linux/export.h> 33#include <linux/swap.h> 34#include <linux/aio.h> 35 36static struct vfsmount *shm_mnt; 37 38#ifdef CONFIG_SHMEM 39/* 40 * This virtual memory filesystem is heavily based on the ramfs. It 41 * extends ramfs by the ability to use swap and honor resource limits 42 * which makes it a completely usable filesystem. 43 */ 44 45#include <linux/xattr.h> 46#include <linux/exportfs.h> 47#include <linux/posix_acl.h> 48#include <linux/posix_acl_xattr.h> 49#include <linux/mman.h> 50#include <linux/string.h> 51#include <linux/slab.h> 52#include <linux/backing-dev.h> 53#include <linux/shmem_fs.h> 54#include <linux/writeback.h> 55#include <linux/blkdev.h> 56#include <linux/pagevec.h> 57#include <linux/percpu_counter.h> 58#include <linux/falloc.h> 59#include <linux/splice.h> 60#include <linux/security.h> 61#include <linux/swapops.h> 62#include <linux/mempolicy.h> 63#include <linux/namei.h> 64#include <linux/ctype.h> 65#include <linux/migrate.h> 66#include <linux/highmem.h> 67#include <linux/seq_file.h> 68#include <linux/magic.h> 69#include <linux/syscalls.h> 70#include <linux/fcntl.h> 71#include <uapi/linux/memfd.h> 72 73#include <asm/uaccess.h> 74#include <asm/pgtable.h> 75 76#define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512) 77#define VM_ACCT(size) (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT) 78 79/* Pretend that each entry is of this size in directory's i_size */ 80#define BOGO_DIRENT_SIZE 20 81 82/* Symlink up to this size is kmalloc'ed instead of using a swappable page */ 83#define SHORT_SYMLINK_LEN 128 84 85/* 86 * shmem_fallocate communicates with shmem_fault or shmem_writepage via 87 * inode->i_private (with i_mutex making sure that it has only one user at 88 * a time): we would prefer not to enlarge the shmem inode just for that. 89 */ 90struct shmem_falloc { 91 wait_queue_head_t *waitq; /* faults into hole wait for punch to end */ 92 pgoff_t start; /* start of range currently being fallocated */ 93 pgoff_t next; /* the next page offset to be fallocated */ 94 pgoff_t nr_falloced; /* how many new pages have been fallocated */ 95 pgoff_t nr_unswapped; /* how often writepage refused to swap out */ 96}; 97 98/* Flag allocation requirements to shmem_getpage */ 99enum sgp_type { 100 SGP_READ, /* don't exceed i_size, don't allocate page */ 101 SGP_CACHE, /* don't exceed i_size, may allocate page */ 102 SGP_DIRTY, /* like SGP_CACHE, but set new page dirty */ 103 SGP_WRITE, /* may exceed i_size, may allocate !Uptodate page */ 104 SGP_FALLOC, /* like SGP_WRITE, but make existing page Uptodate */ 105}; 106 107#ifdef CONFIG_TMPFS 108static unsigned long shmem_default_max_blocks(void) 109{ 110 return totalram_pages / 2; 111} 112 113static unsigned long shmem_default_max_inodes(void) 114{ 115 return min(totalram_pages - totalhigh_pages, totalram_pages / 2); 116} 117#endif 118 119static bool shmem_should_replace_page(struct page *page, gfp_t gfp); 120static int shmem_replace_page(struct page **pagep, gfp_t gfp, 121 struct shmem_inode_info *info, pgoff_t index); 122static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, 123 struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type); 124 125static inline int shmem_getpage(struct inode *inode, pgoff_t index, 126 struct page **pagep, enum sgp_type sgp, int *fault_type) 127{ 128 return shmem_getpage_gfp(inode, index, pagep, sgp, 129 mapping_gfp_mask(inode->i_mapping), fault_type); 130} 131 132static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb) 133{ 134 return sb->s_fs_info; 135} 136 137/* 138 * shmem_file_setup pre-accounts the whole fixed size of a VM object, 139 * for shared memory and for shared anonymous (/dev/zero) mappings 140 * (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1), 141 * consistent with the pre-accounting of private mappings ... 142 */ 143static inline int shmem_acct_size(unsigned long flags, loff_t size) 144{ 145 return (flags & VM_NORESERVE) ? 146 0 : security_vm_enough_memory_mm(current->mm, VM_ACCT(size)); 147} 148 149static inline void shmem_unacct_size(unsigned long flags, loff_t size) 150{ 151 if (!(flags & VM_NORESERVE)) 152 vm_unacct_memory(VM_ACCT(size)); 153} 154 155static inline int shmem_reacct_size(unsigned long flags, 156 loff_t oldsize, loff_t newsize) 157{ 158 if (!(flags & VM_NORESERVE)) { 159 if (VM_ACCT(newsize) > VM_ACCT(oldsize)) 160 return security_vm_enough_memory_mm(current->mm, 161 VM_ACCT(newsize) - VM_ACCT(oldsize)); 162 else if (VM_ACCT(newsize) < VM_ACCT(oldsize)) 163 vm_unacct_memory(VM_ACCT(oldsize) - VM_ACCT(newsize)); 164 } 165 return 0; 166} 167 168/* 169 * ... whereas tmpfs objects are accounted incrementally as 170 * pages are allocated, in order to allow huge sparse files. 171 * shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM, 172 * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM. 173 */ 174static inline int shmem_acct_block(unsigned long flags) 175{ 176 return (flags & VM_NORESERVE) ? 177 security_vm_enough_memory_mm(current->mm, VM_ACCT(PAGE_CACHE_SIZE)) : 0; 178} 179 180static inline void shmem_unacct_blocks(unsigned long flags, long pages) 181{ 182 if (flags & VM_NORESERVE) 183 vm_unacct_memory(pages * VM_ACCT(PAGE_CACHE_SIZE)); 184} 185 186static const struct super_operations shmem_ops; 187static const struct address_space_operations shmem_aops; 188static const struct file_operations shmem_file_operations; 189static const struct inode_operations shmem_inode_operations; 190static const struct inode_operations shmem_dir_inode_operations; 191static const struct inode_operations shmem_special_inode_operations; 192static const struct vm_operations_struct shmem_vm_ops; 193 194static struct backing_dev_info shmem_backing_dev_info __read_mostly = { 195 .ra_pages = 0, /* No readahead */ 196 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, 197}; 198 199static LIST_HEAD(shmem_swaplist); 200static DEFINE_MUTEX(shmem_swaplist_mutex); 201 202static int shmem_reserve_inode(struct super_block *sb) 203{ 204 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 205 if (sbinfo->max_inodes) { 206 spin_lock(&sbinfo->stat_lock); 207 if (!sbinfo->free_inodes) { 208 spin_unlock(&sbinfo->stat_lock); 209 return -ENOSPC; 210 } 211 sbinfo->free_inodes--; 212 spin_unlock(&sbinfo->stat_lock); 213 } 214 return 0; 215} 216 217static void shmem_free_inode(struct super_block *sb) 218{ 219 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 220 if (sbinfo->max_inodes) { 221 spin_lock(&sbinfo->stat_lock); 222 sbinfo->free_inodes++; 223 spin_unlock(&sbinfo->stat_lock); 224 } 225} 226 227/** 228 * shmem_recalc_inode - recalculate the block usage of an inode 229 * @inode: inode to recalc 230 * 231 * We have to calculate the free blocks since the mm can drop 232 * undirtied hole pages behind our back. 233 * 234 * But normally info->alloced == inode->i_mapping->nrpages + info->swapped 235 * So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped) 236 * 237 * It has to be called with the spinlock held. 238 */ 239static void shmem_recalc_inode(struct inode *inode) 240{ 241 struct shmem_inode_info *info = SHMEM_I(inode); 242 long freed; 243 244 freed = info->alloced - info->swapped - inode->i_mapping->nrpages; 245 if (freed > 0) { 246 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); 247 if (sbinfo->max_blocks) 248 percpu_counter_add(&sbinfo->used_blocks, -freed); 249 info->alloced -= freed; 250 inode->i_blocks -= freed * BLOCKS_PER_PAGE; 251 shmem_unacct_blocks(info->flags, freed); 252 } 253} 254 255/* 256 * Replace item expected in radix tree by a new item, while holding tree lock. 257 */ 258static int shmem_radix_tree_replace(struct address_space *mapping, 259 pgoff_t index, void *expected, void *replacement) 260{ 261 void **pslot; 262 void *item; 263 264 VM_BUG_ON(!expected); 265 VM_BUG_ON(!replacement); 266 pslot = radix_tree_lookup_slot(&mapping->page_tree, index); 267 if (!pslot) 268 return -ENOENT; 269 item = radix_tree_deref_slot_protected(pslot, &mapping->tree_lock); 270 if (item != expected) 271 return -ENOENT; 272 radix_tree_replace_slot(pslot, replacement); 273 return 0; 274} 275 276/* 277 * Sometimes, before we decide whether to proceed or to fail, we must check 278 * that an entry was not already brought back from swap by a racing thread. 279 * 280 * Checking page is not enough: by the time a SwapCache page is locked, it 281 * might be reused, and again be SwapCache, using the same swap as before. 282 */ 283static bool shmem_confirm_swap(struct address_space *mapping, 284 pgoff_t index, swp_entry_t swap) 285{ 286 void *item; 287 288 rcu_read_lock(); 289 item = radix_tree_lookup(&mapping->page_tree, index); 290 rcu_read_unlock(); 291 return item == swp_to_radix_entry(swap); 292} 293 294/* 295 * Like add_to_page_cache_locked, but error if expected item has gone. 296 */ 297static int shmem_add_to_page_cache(struct page *page, 298 struct address_space *mapping, 299 pgoff_t index, void *expected) 300{ 301 int error; 302 303 VM_BUG_ON_PAGE(!PageLocked(page), page); 304 VM_BUG_ON_PAGE(!PageSwapBacked(page), page); 305 306 page_cache_get(page); 307 page->mapping = mapping; 308 page->index = index; 309 310 spin_lock_irq(&mapping->tree_lock); 311 if (!expected) 312 error = radix_tree_insert(&mapping->page_tree, index, page); 313 else 314 error = shmem_radix_tree_replace(mapping, index, expected, 315 page); 316 if (!error) { 317 mapping->nrpages++; 318 __inc_zone_page_state(page, NR_FILE_PAGES); 319 __inc_zone_page_state(page, NR_SHMEM); 320 spin_unlock_irq(&mapping->tree_lock); 321 } else { 322 page->mapping = NULL; 323 spin_unlock_irq(&mapping->tree_lock); 324 page_cache_release(page); 325 } 326 return error; 327} 328 329/* 330 * Like delete_from_page_cache, but substitutes swap for page. 331 */ 332static void shmem_delete_from_page_cache(struct page *page, void *radswap) 333{ 334 struct address_space *mapping = page->mapping; 335 int error; 336 337 spin_lock_irq(&mapping->tree_lock); 338 error = shmem_radix_tree_replace(mapping, page->index, page, radswap); 339 page->mapping = NULL; 340 mapping->nrpages--; 341 __dec_zone_page_state(page, NR_FILE_PAGES); 342 __dec_zone_page_state(page, NR_SHMEM); 343 spin_unlock_irq(&mapping->tree_lock); 344 page_cache_release(page); 345 BUG_ON(error); 346} 347 348/* 349 * Remove swap entry from radix tree, free the swap and its page cache. 350 */ 351static int shmem_free_swap(struct address_space *mapping, 352 pgoff_t index, void *radswap) 353{ 354 void *old; 355 356 spin_lock_irq(&mapping->tree_lock); 357 old = radix_tree_delete_item(&mapping->page_tree, index, radswap); 358 spin_unlock_irq(&mapping->tree_lock); 359 if (old != radswap) 360 return -ENOENT; 361 free_swap_and_cache(radix_to_swp_entry(radswap)); 362 return 0; 363} 364 365/* 366 * SysV IPC SHM_UNLOCK restore Unevictable pages to their evictable lists. 367 */ 368void shmem_unlock_mapping(struct address_space *mapping) 369{ 370 struct pagevec pvec; 371 pgoff_t indices[PAGEVEC_SIZE]; 372 pgoff_t index = 0; 373 374 pagevec_init(&pvec, 0); 375 /* 376 * Minor point, but we might as well stop if someone else SHM_LOCKs it. 377 */ 378 while (!mapping_unevictable(mapping)) { 379 /* 380 * Avoid pagevec_lookup(): find_get_pages() returns 0 as if it 381 * has finished, if it hits a row of PAGEVEC_SIZE swap entries. 382 */ 383 pvec.nr = find_get_entries(mapping, index, 384 PAGEVEC_SIZE, pvec.pages, indices); 385 if (!pvec.nr) 386 break; 387 index = indices[pvec.nr - 1] + 1; 388 pagevec_remove_exceptionals(&pvec); 389 check_move_unevictable_pages(pvec.pages, pvec.nr); 390 pagevec_release(&pvec); 391 cond_resched(); 392 } 393} 394 395/* 396 * Remove range of pages and swap entries from radix tree, and free them. 397 * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate. 398 */ 399static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, 400 bool unfalloc) 401{ 402 struct address_space *mapping = inode->i_mapping; 403 struct shmem_inode_info *info = SHMEM_I(inode); 404 pgoff_t start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 405 pgoff_t end = (lend + 1) >> PAGE_CACHE_SHIFT; 406 unsigned int partial_start = lstart & (PAGE_CACHE_SIZE - 1); 407 unsigned int partial_end = (lend + 1) & (PAGE_CACHE_SIZE - 1); 408 struct pagevec pvec; 409 pgoff_t indices[PAGEVEC_SIZE]; 410 long nr_swaps_freed = 0; 411 pgoff_t index; 412 int i; 413 414 if (lend == -1) 415 end = -1; /* unsigned, so actually very big */ 416 417 pagevec_init(&pvec, 0); 418 index = start; 419 while (index < end) { 420 pvec.nr = find_get_entries(mapping, index, 421 min(end - index, (pgoff_t)PAGEVEC_SIZE), 422 pvec.pages, indices); 423 if (!pvec.nr) 424 break; 425 for (i = 0; i < pagevec_count(&pvec); i++) { 426 struct page *page = pvec.pages[i]; 427 428 index = indices[i]; 429 if (index >= end) 430 break; 431 432 if (radix_tree_exceptional_entry(page)) { 433 if (unfalloc) 434 continue; 435 nr_swaps_freed += !shmem_free_swap(mapping, 436 index, page); 437 continue; 438 } 439 440 if (!trylock_page(page)) 441 continue; 442 if (!unfalloc || !PageUptodate(page)) { 443 if (page->mapping == mapping) { 444 VM_BUG_ON_PAGE(PageWriteback(page), page); 445 truncate_inode_page(mapping, page); 446 } 447 } 448 unlock_page(page); 449 } 450 pagevec_remove_exceptionals(&pvec); 451 pagevec_release(&pvec); 452 cond_resched(); 453 index++; 454 } 455 456 if (partial_start) { 457 struct page *page = NULL; 458 shmem_getpage(inode, start - 1, &page, SGP_READ, NULL); 459 if (page) { 460 unsigned int top = PAGE_CACHE_SIZE; 461 if (start > end) { 462 top = partial_end; 463 partial_end = 0; 464 } 465 zero_user_segment(page, partial_start, top); 466 set_page_dirty(page); 467 unlock_page(page); 468 page_cache_release(page); 469 } 470 } 471 if (partial_end) { 472 struct page *page = NULL; 473 shmem_getpage(inode, end, &page, SGP_READ, NULL); 474 if (page) { 475 zero_user_segment(page, 0, partial_end); 476 set_page_dirty(page); 477 unlock_page(page); 478 page_cache_release(page); 479 } 480 } 481 if (start >= end) 482 return; 483 484 index = start; 485 while (index < end) { 486 cond_resched(); 487 488 pvec.nr = find_get_entries(mapping, index, 489 min(end - index, (pgoff_t)PAGEVEC_SIZE), 490 pvec.pages, indices); 491 if (!pvec.nr) { 492 /* If all gone or hole-punch or unfalloc, we're done */ 493 if (index == start || end != -1) 494 break; 495 /* But if truncating, restart to make sure all gone */ 496 index = start; 497 continue; 498 } 499 for (i = 0; i < pagevec_count(&pvec); i++) { 500 struct page *page = pvec.pages[i]; 501 502 index = indices[i]; 503 if (index >= end) 504 break; 505 506 if (radix_tree_exceptional_entry(page)) { 507 if (unfalloc) 508 continue; 509 if (shmem_free_swap(mapping, index, page)) { 510 /* Swap was replaced by page: retry */ 511 index--; 512 break; 513 } 514 nr_swaps_freed++; 515 continue; 516 } 517 518 lock_page(page); 519 if (!unfalloc || !PageUptodate(page)) { 520 if (page->mapping == mapping) { 521 VM_BUG_ON_PAGE(PageWriteback(page), page); 522 truncate_inode_page(mapping, page); 523 } else { 524 /* Page was replaced by swap: retry */ 525 unlock_page(page); 526 index--; 527 break; 528 } 529 } 530 unlock_page(page); 531 } 532 pagevec_remove_exceptionals(&pvec); 533 pagevec_release(&pvec); 534 index++; 535 } 536 537 spin_lock(&info->lock); 538 info->swapped -= nr_swaps_freed; 539 shmem_recalc_inode(inode); 540 spin_unlock(&info->lock); 541} 542 543void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) 544{ 545 shmem_undo_range(inode, lstart, lend, false); 546 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 547} 548EXPORT_SYMBOL_GPL(shmem_truncate_range); 549 550static int shmem_setattr(struct dentry *dentry, struct iattr *attr) 551{ 552 struct inode *inode = dentry->d_inode; 553 struct shmem_inode_info *info = SHMEM_I(inode); 554 int error; 555 556 error = inode_change_ok(inode, attr); 557 if (error) 558 return error; 559 560 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { 561 loff_t oldsize = inode->i_size; 562 loff_t newsize = attr->ia_size; 563 564 /* protected by i_mutex */ 565 if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) || 566 (newsize > oldsize && (info->seals & F_SEAL_GROW))) 567 return -EPERM; 568 569 if (newsize != oldsize) { 570 error = shmem_reacct_size(SHMEM_I(inode)->flags, 571 oldsize, newsize); 572 if (error) 573 return error; 574 i_size_write(inode, newsize); 575 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 576 } 577 if (newsize < oldsize) { 578 loff_t holebegin = round_up(newsize, PAGE_SIZE); 579 unmap_mapping_range(inode->i_mapping, holebegin, 0, 1); 580 shmem_truncate_range(inode, newsize, (loff_t)-1); 581 /* unmap again to remove racily COWed private pages */ 582 unmap_mapping_range(inode->i_mapping, holebegin, 0, 1); 583 } 584 } 585 586 setattr_copy(inode, attr); 587 if (attr->ia_valid & ATTR_MODE) 588 error = posix_acl_chmod(inode, inode->i_mode); 589 return error; 590} 591 592static void shmem_evict_inode(struct inode *inode) 593{ 594 struct shmem_inode_info *info = SHMEM_I(inode); 595 596 if (inode->i_mapping->a_ops == &shmem_aops) { 597 shmem_unacct_size(info->flags, inode->i_size); 598 inode->i_size = 0; 599 shmem_truncate_range(inode, 0, (loff_t)-1); 600 if (!list_empty(&info->swaplist)) { 601 mutex_lock(&shmem_swaplist_mutex); 602 list_del_init(&info->swaplist); 603 mutex_unlock(&shmem_swaplist_mutex); 604 } 605 } else 606 kfree(info->symlink); 607 608 simple_xattrs_free(&info->xattrs); 609 WARN_ON(inode->i_blocks); 610 shmem_free_inode(inode->i_sb); 611 clear_inode(inode); 612} 613 614/* 615 * If swap found in inode, free it and move page from swapcache to filecache. 616 */ 617static int shmem_unuse_inode(struct shmem_inode_info *info, 618 swp_entry_t swap, struct page **pagep) 619{ 620 struct address_space *mapping = info->vfs_inode.i_mapping; 621 void *radswap; 622 pgoff_t index; 623 gfp_t gfp; 624 int error = 0; 625 626 radswap = swp_to_radix_entry(swap); 627 index = radix_tree_locate_item(&mapping->page_tree, radswap); 628 if (index == -1) 629 return -EAGAIN; /* tell shmem_unuse we found nothing */ 630 631 /* 632 * Move _head_ to start search for next from here. 633 * But be careful: shmem_evict_inode checks list_empty without taking 634 * mutex, and there's an instant in list_move_tail when info->swaplist 635 * would appear empty, if it were the only one on shmem_swaplist. 636 */ 637 if (shmem_swaplist.next != &info->swaplist) 638 list_move_tail(&shmem_swaplist, &info->swaplist); 639 640 gfp = mapping_gfp_mask(mapping); 641 if (shmem_should_replace_page(*pagep, gfp)) { 642 mutex_unlock(&shmem_swaplist_mutex); 643 error = shmem_replace_page(pagep, gfp, info, index); 644 mutex_lock(&shmem_swaplist_mutex); 645 /* 646 * We needed to drop mutex to make that restrictive page 647 * allocation, but the inode might have been freed while we 648 * dropped it: although a racing shmem_evict_inode() cannot 649 * complete without emptying the radix_tree, our page lock 650 * on this swapcache page is not enough to prevent that - 651 * free_swap_and_cache() of our swap entry will only 652 * trylock_page(), removing swap from radix_tree whatever. 653 * 654 * We must not proceed to shmem_add_to_page_cache() if the 655 * inode has been freed, but of course we cannot rely on 656 * inode or mapping or info to check that. However, we can 657 * safely check if our swap entry is still in use (and here 658 * it can't have got reused for another page): if it's still 659 * in use, then the inode cannot have been freed yet, and we 660 * can safely proceed (if it's no longer in use, that tells 661 * nothing about the inode, but we don't need to unuse swap). 662 */ 663 if (!page_swapcount(*pagep)) 664 error = -ENOENT; 665 } 666 667 /* 668 * We rely on shmem_swaplist_mutex, not only to protect the swaplist, 669 * but also to hold up shmem_evict_inode(): so inode cannot be freed 670 * beneath us (pagelock doesn't help until the page is in pagecache). 671 */ 672 if (!error) 673 error = shmem_add_to_page_cache(*pagep, mapping, index, 674 radswap); 675 if (error != -ENOMEM) { 676 /* 677 * Truncation and eviction use free_swap_and_cache(), which 678 * only does trylock page: if we raced, best clean up here. 679 */ 680 delete_from_swap_cache(*pagep); 681 set_page_dirty(*pagep); 682 if (!error) { 683 spin_lock(&info->lock); 684 info->swapped--; 685 spin_unlock(&info->lock); 686 swap_free(swap); 687 } 688 } 689 return error; 690} 691 692/* 693 * Search through swapped inodes to find and replace swap by page. 694 */ 695int shmem_unuse(swp_entry_t swap, struct page *page) 696{ 697 struct list_head *this, *next; 698 struct shmem_inode_info *info; 699 struct mem_cgroup *memcg; 700 int error = 0; 701 702 /* 703 * There's a faint possibility that swap page was replaced before 704 * caller locked it: caller will come back later with the right page. 705 */ 706 if (unlikely(!PageSwapCache(page) || page_private(page) != swap.val)) 707 goto out; 708 709 /* 710 * Charge page using GFP_KERNEL while we can wait, before taking 711 * the shmem_swaplist_mutex which might hold up shmem_writepage(). 712 * Charged back to the user (not to caller) when swap account is used. 713 */ 714 error = mem_cgroup_try_charge(page, current->mm, GFP_KERNEL, &memcg); 715 if (error) 716 goto out; 717 /* No radix_tree_preload: swap entry keeps a place for page in tree */ 718 error = -EAGAIN; 719 720 mutex_lock(&shmem_swaplist_mutex); 721 list_for_each_safe(this, next, &shmem_swaplist) { 722 info = list_entry(this, struct shmem_inode_info, swaplist); 723 if (info->swapped) 724 error = shmem_unuse_inode(info, swap, &page); 725 else 726 list_del_init(&info->swaplist); 727 cond_resched(); 728 if (error != -EAGAIN) 729 break; 730 /* found nothing in this: move on to search the next */ 731 } 732 mutex_unlock(&shmem_swaplist_mutex); 733 734 if (error) { 735 if (error != -ENOMEM) 736 error = 0; 737 mem_cgroup_cancel_charge(page, memcg); 738 } else 739 mem_cgroup_commit_charge(page, memcg, true); 740out: 741 unlock_page(page); 742 page_cache_release(page); 743 return error; 744} 745 746/* 747 * Move the page from the page cache to the swap cache. 748 */ 749static int shmem_writepage(struct page *page, struct writeback_control *wbc) 750{ 751 struct shmem_inode_info *info; 752 struct address_space *mapping; 753 struct inode *inode; 754 swp_entry_t swap; 755 pgoff_t index; 756 757 BUG_ON(!PageLocked(page)); 758 mapping = page->mapping; 759 index = page->index; 760 inode = mapping->host; 761 info = SHMEM_I(inode); 762 if (info->flags & VM_LOCKED) 763 goto redirty; 764 if (!total_swap_pages) 765 goto redirty; 766 767 /* 768 * shmem_backing_dev_info's capabilities prevent regular writeback or 769 * sync from ever calling shmem_writepage; but a stacking filesystem 770 * might use ->writepage of its underlying filesystem, in which case 771 * tmpfs should write out to swap only in response to memory pressure, 772 * and not for the writeback threads or sync. 773 */ 774 if (!wbc->for_reclaim) { 775 WARN_ON_ONCE(1); /* Still happens? Tell us about it! */ 776 goto redirty; 777 } 778 779 /* 780 * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC 781 * value into swapfile.c, the only way we can correctly account for a 782 * fallocated page arriving here is now to initialize it and write it. 783 * 784 * That's okay for a page already fallocated earlier, but if we have 785 * not yet completed the fallocation, then (a) we want to keep track 786 * of this page in case we have to undo it, and (b) it may not be a 787 * good idea to continue anyway, once we're pushing into swap. So 788 * reactivate the page, and let shmem_fallocate() quit when too many. 789 */ 790 if (!PageUptodate(page)) { 791 if (inode->i_private) { 792 struct shmem_falloc *shmem_falloc; 793 spin_lock(&inode->i_lock); 794 shmem_falloc = inode->i_private; 795 if (shmem_falloc && 796 !shmem_falloc->waitq && 797 index >= shmem_falloc->start && 798 index < shmem_falloc->next) 799 shmem_falloc->nr_unswapped++; 800 else 801 shmem_falloc = NULL; 802 spin_unlock(&inode->i_lock); 803 if (shmem_falloc) 804 goto redirty; 805 } 806 clear_highpage(page); 807 flush_dcache_page(page); 808 SetPageUptodate(page); 809 } 810 811 swap = get_swap_page(); 812 if (!swap.val) 813 goto redirty; 814 815 /* 816 * Add inode to shmem_unuse()'s list of swapped-out inodes, 817 * if it's not already there. Do it now before the page is 818 * moved to swap cache, when its pagelock no longer protects 819 * the inode from eviction. But don't unlock the mutex until 820 * we've incremented swapped, because shmem_unuse_inode() will 821 * prune a !swapped inode from the swaplist under this mutex. 822 */ 823 mutex_lock(&shmem_swaplist_mutex); 824 if (list_empty(&info->swaplist)) 825 list_add_tail(&info->swaplist, &shmem_swaplist); 826 827 if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) { 828 swap_shmem_alloc(swap); 829 shmem_delete_from_page_cache(page, swp_to_radix_entry(swap)); 830 831 spin_lock(&info->lock); 832 info->swapped++; 833 shmem_recalc_inode(inode); 834 spin_unlock(&info->lock); 835 836 mutex_unlock(&shmem_swaplist_mutex); 837 BUG_ON(page_mapped(page)); 838 swap_writepage(page, wbc); 839 return 0; 840 } 841 842 mutex_unlock(&shmem_swaplist_mutex); 843 swapcache_free(swap); 844redirty: 845 set_page_dirty(page); 846 if (wbc->for_reclaim) 847 return AOP_WRITEPAGE_ACTIVATE; /* Return with page locked */ 848 unlock_page(page); 849 return 0; 850} 851 852#ifdef CONFIG_NUMA 853#ifdef CONFIG_TMPFS 854static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) 855{ 856 char buffer[64]; 857 858 if (!mpol || mpol->mode == MPOL_DEFAULT) 859 return; /* show nothing */ 860 861 mpol_to_str(buffer, sizeof(buffer), mpol); 862 863 seq_printf(seq, ",mpol=%s", buffer); 864} 865 866static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) 867{ 868 struct mempolicy *mpol = NULL; 869 if (sbinfo->mpol) { 870 spin_lock(&sbinfo->stat_lock); /* prevent replace/use races */ 871 mpol = sbinfo->mpol; 872 mpol_get(mpol); 873 spin_unlock(&sbinfo->stat_lock); 874 } 875 return mpol; 876} 877#endif /* CONFIG_TMPFS */ 878 879static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, 880 struct shmem_inode_info *info, pgoff_t index) 881{ 882 struct vm_area_struct pvma; 883 struct page *page; 884 885 /* Create a pseudo vma that just contains the policy */ 886 pvma.vm_start = 0; 887 /* Bias interleave by inode number to distribute better across nodes */ 888 pvma.vm_pgoff = index + info->vfs_inode.i_ino; 889 pvma.vm_ops = NULL; 890 pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index); 891 892 page = swapin_readahead(swap, gfp, &pvma, 0); 893 894 /* Drop reference taken by mpol_shared_policy_lookup() */ 895 mpol_cond_put(pvma.vm_policy); 896 897 return page; 898} 899 900static struct page *shmem_alloc_page(gfp_t gfp, 901 struct shmem_inode_info *info, pgoff_t index) 902{ 903 struct vm_area_struct pvma; 904 struct page *page; 905 906 /* Create a pseudo vma that just contains the policy */ 907 pvma.vm_start = 0; 908 /* Bias interleave by inode number to distribute better across nodes */ 909 pvma.vm_pgoff = index + info->vfs_inode.i_ino; 910 pvma.vm_ops = NULL; 911 pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index); 912 913 page = alloc_page_vma(gfp, &pvma, 0); 914 915 /* Drop reference taken by mpol_shared_policy_lookup() */ 916 mpol_cond_put(pvma.vm_policy); 917 918 return page; 919} 920#else /* !CONFIG_NUMA */ 921#ifdef CONFIG_TMPFS 922static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) 923{ 924} 925#endif /* CONFIG_TMPFS */ 926 927static inline struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, 928 struct shmem_inode_info *info, pgoff_t index) 929{ 930 return swapin_readahead(swap, gfp, NULL, 0); 931} 932 933static inline struct page *shmem_alloc_page(gfp_t gfp, 934 struct shmem_inode_info *info, pgoff_t index) 935{ 936 return alloc_page(gfp); 937} 938#endif /* CONFIG_NUMA */ 939 940#if !defined(CONFIG_NUMA) || !defined(CONFIG_TMPFS) 941static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) 942{ 943 return NULL; 944} 945#endif 946 947/* 948 * When a page is moved from swapcache to shmem filecache (either by the 949 * usual swapin of shmem_getpage_gfp(), or by the less common swapoff of 950 * shmem_unuse_inode()), it may have been read in earlier from swap, in 951 * ignorance of the mapping it belongs to. If that mapping has special 952 * constraints (like the gma500 GEM driver, which requires RAM below 4GB), 953 * we may need to copy to a suitable page before moving to filecache. 954 * 955 * In a future release, this may well be extended to respect cpuset and 956 * NUMA mempolicy, and applied also to anonymous pages in do_swap_page(); 957 * but for now it is a simple matter of zone. 958 */ 959static bool shmem_should_replace_page(struct page *page, gfp_t gfp) 960{ 961 return page_zonenum(page) > gfp_zone(gfp); 962} 963 964static int shmem_replace_page(struct page **pagep, gfp_t gfp, 965 struct shmem_inode_info *info, pgoff_t index) 966{ 967 struct page *oldpage, *newpage; 968 struct address_space *swap_mapping; 969 pgoff_t swap_index; 970 int error; 971 972 oldpage = *pagep; 973 swap_index = page_private(oldpage); 974 swap_mapping = page_mapping(oldpage); 975 976 /* 977 * We have arrived here because our zones are constrained, so don't 978 * limit chance of success by further cpuset and node constraints. 979 */ 980 gfp &= ~GFP_CONSTRAINT_MASK; 981 newpage = shmem_alloc_page(gfp, info, index); 982 if (!newpage) 983 return -ENOMEM; 984 985 page_cache_get(newpage); 986 copy_highpage(newpage, oldpage); 987 flush_dcache_page(newpage); 988 989 __set_page_locked(newpage); 990 SetPageUptodate(newpage); 991 SetPageSwapBacked(newpage); 992 set_page_private(newpage, swap_index); 993 SetPageSwapCache(newpage); 994 995 /* 996 * Our caller will very soon move newpage out of swapcache, but it's 997 * a nice clean interface for us to replace oldpage by newpage there. 998 */ 999 spin_lock_irq(&swap_mapping->tree_lock); 1000 error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage, 1001 newpage); 1002 if (!error) { 1003 __inc_zone_page_state(newpage, NR_FILE_PAGES); 1004 __dec_zone_page_state(oldpage, NR_FILE_PAGES); 1005 } 1006 spin_unlock_irq(&swap_mapping->tree_lock); 1007 1008 if (unlikely(error)) { 1009 /* 1010 * Is this possible? I think not, now that our callers check 1011 * both PageSwapCache and page_private after getting page lock; 1012 * but be defensive. Reverse old to newpage for clear and free. 1013 */ 1014 oldpage = newpage; 1015 } else { 1016 mem_cgroup_migrate(oldpage, newpage, false); 1017 lru_cache_add_anon(newpage); 1018 *pagep = newpage; 1019 } 1020 1021 ClearPageSwapCache(oldpage); 1022 set_page_private(oldpage, 0); 1023 1024 unlock_page(oldpage); 1025 page_cache_release(oldpage); 1026 page_cache_release(oldpage); 1027 return error; 1028} 1029 1030/* 1031 * shmem_getpage_gfp - find page in cache, or get from swap, or allocate 1032 * 1033 * If we allocate a new one we do not mark it dirty. That's up to the 1034 * vm. If we swap it in we mark it dirty since we also free the swap 1035 * entry since a page cannot live in both the swap and page cache 1036 */ 1037static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, 1038 struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type) 1039{ 1040 struct address_space *mapping = inode->i_mapping; 1041 struct shmem_inode_info *info; 1042 struct shmem_sb_info *sbinfo; 1043 struct mem_cgroup *memcg; 1044 struct page *page; 1045 swp_entry_t swap; 1046 int error; 1047 int once = 0; 1048 int alloced = 0; 1049 1050 if (index > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT)) 1051 return -EFBIG; 1052repeat: 1053 swap.val = 0; 1054 page = find_lock_entry(mapping, index); 1055 if (radix_tree_exceptional_entry(page)) { 1056 swap = radix_to_swp_entry(page); 1057 page = NULL; 1058 } 1059 1060 if (sgp != SGP_WRITE && sgp != SGP_FALLOC && 1061 ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { 1062 error = -EINVAL; 1063 goto failed; 1064 } 1065 1066 if (page && sgp == SGP_WRITE) 1067 mark_page_accessed(page); 1068 1069 /* fallocated page? */ 1070 if (page && !PageUptodate(page)) { 1071 if (sgp != SGP_READ) 1072 goto clear; 1073 unlock_page(page); 1074 page_cache_release(page); 1075 page = NULL; 1076 } 1077 if (page || (sgp == SGP_READ && !swap.val)) { 1078 *pagep = page; 1079 return 0; 1080 } 1081 1082 /* 1083 * Fast cache lookup did not find it: 1084 * bring it back from swap or allocate. 1085 */ 1086 info = SHMEM_I(inode); 1087 sbinfo = SHMEM_SB(inode->i_sb); 1088 1089 if (swap.val) { 1090 /* Look it up and read it in.. */ 1091 page = lookup_swap_cache(swap); 1092 if (!page) { 1093 /* here we actually do the io */ 1094 if (fault_type) 1095 *fault_type |= VM_FAULT_MAJOR; 1096 page = shmem_swapin(swap, gfp, info, index); 1097 if (!page) { 1098 error = -ENOMEM; 1099 goto failed; 1100 } 1101 } 1102 1103 /* We have to do this with page locked to prevent races */ 1104 lock_page(page); 1105 if (!PageSwapCache(page) || page_private(page) != swap.val || 1106 !shmem_confirm_swap(mapping, index, swap)) { 1107 error = -EEXIST; /* try again */ 1108 goto unlock; 1109 } 1110 if (!PageUptodate(page)) { 1111 error = -EIO; 1112 goto failed; 1113 } 1114 wait_on_page_writeback(page); 1115 1116 if (shmem_should_replace_page(page, gfp)) { 1117 error = shmem_replace_page(&page, gfp, info, index); 1118 if (error) 1119 goto failed; 1120 } 1121 1122 error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg); 1123 if (!error) { 1124 error = shmem_add_to_page_cache(page, mapping, index, 1125 swp_to_radix_entry(swap)); 1126 /* 1127 * We already confirmed swap under page lock, and make 1128 * no memory allocation here, so usually no possibility 1129 * of error; but free_swap_and_cache() only trylocks a 1130 * page, so it is just possible that the entry has been 1131 * truncated or holepunched since swap was confirmed. 1132 * shmem_undo_range() will have done some of the 1133 * unaccounting, now delete_from_swap_cache() will do 1134 * the rest (including mem_cgroup_uncharge_swapcache). 1135 * Reset swap.val? No, leave it so "failed" goes back to 1136 * "repeat": reading a hole and writing should succeed. 1137 */ 1138 if (error) { 1139 mem_cgroup_cancel_charge(page, memcg); 1140 delete_from_swap_cache(page); 1141 } 1142 } 1143 if (error) 1144 goto failed; 1145 1146 mem_cgroup_commit_charge(page, memcg, true); 1147 1148 spin_lock(&info->lock); 1149 info->swapped--; 1150 shmem_recalc_inode(inode); 1151 spin_unlock(&info->lock); 1152 1153 if (sgp == SGP_WRITE) 1154 mark_page_accessed(page); 1155 1156 delete_from_swap_cache(page); 1157 set_page_dirty(page); 1158 swap_free(swap); 1159 1160 } else { 1161 if (shmem_acct_block(info->flags)) { 1162 error = -ENOSPC; 1163 goto failed; 1164 } 1165 if (sbinfo->max_blocks) { 1166 if (percpu_counter_compare(&sbinfo->used_blocks, 1167 sbinfo->max_blocks) >= 0) { 1168 error = -ENOSPC; 1169 goto unacct; 1170 } 1171 percpu_counter_inc(&sbinfo->used_blocks); 1172 } 1173 1174 page = shmem_alloc_page(gfp, info, index); 1175 if (!page) { 1176 error = -ENOMEM; 1177 goto decused; 1178 } 1179 1180 __SetPageSwapBacked(page); 1181 __set_page_locked(page); 1182 if (sgp == SGP_WRITE) 1183 __SetPageReferenced(page); 1184 1185 error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg); 1186 if (error) 1187 goto decused; 1188 error = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK); 1189 if (!error) { 1190 error = shmem_add_to_page_cache(page, mapping, index, 1191 NULL); 1192 radix_tree_preload_end(); 1193 } 1194 if (error) { 1195 mem_cgroup_cancel_charge(page, memcg); 1196 goto decused; 1197 } 1198 mem_cgroup_commit_charge(page, memcg, false); 1199 lru_cache_add_anon(page); 1200 1201 spin_lock(&info->lock); 1202 info->alloced++; 1203 inode->i_blocks += BLOCKS_PER_PAGE; 1204 shmem_recalc_inode(inode); 1205 spin_unlock(&info->lock); 1206 alloced = true; 1207 1208 /* 1209 * Let SGP_FALLOC use the SGP_WRITE optimization on a new page. 1210 */ 1211 if (sgp == SGP_FALLOC) 1212 sgp = SGP_WRITE; 1213clear: 1214 /* 1215 * Let SGP_WRITE caller clear ends if write does not fill page; 1216 * but SGP_FALLOC on a page fallocated earlier must initialize 1217 * it now, lest undo on failure cancel our earlier guarantee. 1218 */ 1219 if (sgp != SGP_WRITE) { 1220 clear_highpage(page); 1221 flush_dcache_page(page); 1222 SetPageUptodate(page); 1223 } 1224 if (sgp == SGP_DIRTY) 1225 set_page_dirty(page); 1226 } 1227 1228 /* Perhaps the file has been truncated since we checked */ 1229 if (sgp != SGP_WRITE && sgp != SGP_FALLOC && 1230 ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { 1231 error = -EINVAL; 1232 if (alloced) 1233 goto trunc; 1234 else 1235 goto failed; 1236 } 1237 *pagep = page; 1238 return 0; 1239 1240 /* 1241 * Error recovery. 1242 */ 1243trunc: 1244 info = SHMEM_I(inode); 1245 ClearPageDirty(page); 1246 delete_from_page_cache(page); 1247 spin_lock(&info->lock); 1248 info->alloced--; 1249 inode->i_blocks -= BLOCKS_PER_PAGE; 1250 spin_unlock(&info->lock); 1251decused: 1252 sbinfo = SHMEM_SB(inode->i_sb); 1253 if (sbinfo->max_blocks) 1254 percpu_counter_add(&sbinfo->used_blocks, -1); 1255unacct: 1256 shmem_unacct_blocks(info->flags, 1); 1257failed: 1258 if (swap.val && error != -EINVAL && 1259 !shmem_confirm_swap(mapping, index, swap)) 1260 error = -EEXIST; 1261unlock: 1262 if (page) { 1263 unlock_page(page); 1264 page_cache_release(page); 1265 } 1266 if (error == -ENOSPC && !once++) { 1267 info = SHMEM_I(inode); 1268 spin_lock(&info->lock); 1269 shmem_recalc_inode(inode); 1270 spin_unlock(&info->lock); 1271 goto repeat; 1272 } 1273 if (error == -EEXIST) /* from above or from radix_tree_insert */ 1274 goto repeat; 1275 return error; 1276} 1277 1278static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 1279{ 1280 struct inode *inode = file_inode(vma->vm_file); 1281 int error; 1282 int ret = VM_FAULT_LOCKED; 1283 1284 /* 1285 * Trinity finds that probing a hole which tmpfs is punching can 1286 * prevent the hole-punch from ever completing: which in turn 1287 * locks writers out with its hold on i_mutex. So refrain from 1288 * faulting pages into the hole while it's being punched. Although 1289 * shmem_undo_range() does remove the additions, it may be unable to 1290 * keep up, as each new page needs its own unmap_mapping_range() call, 1291 * and the i_mmap tree grows ever slower to scan if new vmas are added. 1292 * 1293 * It does not matter if we sometimes reach this check just before the 1294 * hole-punch begins, so that one fault then races with the punch: 1295 * we just need to make racing faults a rare case. 1296 * 1297 * The implementation below would be much simpler if we just used a 1298 * standard mutex or completion: but we cannot take i_mutex in fault, 1299 * and bloating every shmem inode for this unlikely case would be sad. 1300 */ 1301 if (unlikely(inode->i_private)) { 1302 struct shmem_falloc *shmem_falloc; 1303 1304 spin_lock(&inode->i_lock); 1305 shmem_falloc = inode->i_private; 1306 if (shmem_falloc && 1307 shmem_falloc->waitq && 1308 vmf->pgoff >= shmem_falloc->start && 1309 vmf->pgoff < shmem_falloc->next) { 1310 wait_queue_head_t *shmem_falloc_waitq; 1311 DEFINE_WAIT(shmem_fault_wait); 1312 1313 ret = VM_FAULT_NOPAGE; 1314 if ((vmf->flags & FAULT_FLAG_ALLOW_RETRY) && 1315 !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) { 1316 /* It's polite to up mmap_sem if we can */ 1317 up_read(&vma->vm_mm->mmap_sem); 1318 ret = VM_FAULT_RETRY; 1319 } 1320 1321 shmem_falloc_waitq = shmem_falloc->waitq; 1322 prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait, 1323 TASK_UNINTERRUPTIBLE); 1324 spin_unlock(&inode->i_lock); 1325 schedule(); 1326 1327 /* 1328 * shmem_falloc_waitq points into the shmem_fallocate() 1329 * stack of the hole-punching task: shmem_falloc_waitq 1330 * is usually invalid by the time we reach here, but 1331 * finish_wait() does not dereference it in that case; 1332 * though i_lock needed lest racing with wake_up_all(). 1333 */ 1334 spin_lock(&inode->i_lock); 1335 finish_wait(shmem_falloc_waitq, &shmem_fault_wait); 1336 spin_unlock(&inode->i_lock); 1337 return ret; 1338 } 1339 spin_unlock(&inode->i_lock); 1340 } 1341 1342 error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret); 1343 if (error) 1344 return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); 1345 1346 if (ret & VM_FAULT_MAJOR) { 1347 count_vm_event(PGMAJFAULT); 1348 mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); 1349 } 1350 return ret; 1351} 1352 1353#ifdef CONFIG_NUMA 1354static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol) 1355{ 1356 struct inode *inode = file_inode(vma->vm_file); 1357 return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol); 1358} 1359 1360static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, 1361 unsigned long addr) 1362{ 1363 struct inode *inode = file_inode(vma->vm_file); 1364 pgoff_t index; 1365 1366 index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 1367 return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index); 1368} 1369#endif 1370 1371int shmem_lock(struct file *file, int lock, struct user_struct *user) 1372{ 1373 struct inode *inode = file_inode(file); 1374 struct shmem_inode_info *info = SHMEM_I(inode); 1375 int retval = -ENOMEM; 1376 1377 spin_lock(&info->lock); 1378 if (lock && !(info->flags & VM_LOCKED)) { 1379 if (!user_shm_lock(inode->i_size, user)) 1380 goto out_nomem; 1381 info->flags |= VM_LOCKED; 1382 mapping_set_unevictable(file->f_mapping); 1383 } 1384 if (!lock && (info->flags & VM_LOCKED) && user) { 1385 user_shm_unlock(inode->i_size, user); 1386 info->flags &= ~VM_LOCKED; 1387 mapping_clear_unevictable(file->f_mapping); 1388 } 1389 retval = 0; 1390 1391out_nomem: 1392 spin_unlock(&info->lock); 1393 return retval; 1394} 1395 1396static int shmem_mmap(struct file *file, struct vm_area_struct *vma) 1397{ 1398 file_accessed(file); 1399 vma->vm_ops = &shmem_vm_ops; 1400 return 0; 1401} 1402 1403static struct inode *shmem_get_inode(struct super_block *sb, const struct inode *dir, 1404 umode_t mode, dev_t dev, unsigned long flags) 1405{ 1406 struct inode *inode; 1407 struct shmem_inode_info *info; 1408 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 1409 1410 if (shmem_reserve_inode(sb)) 1411 return NULL; 1412 1413 inode = new_inode(sb); 1414 if (inode) { 1415 inode->i_ino = get_next_ino(); 1416 inode_init_owner(inode, dir, mode); 1417 inode->i_blocks = 0; 1418 inode->i_mapping->backing_dev_info = &shmem_backing_dev_info; 1419 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 1420 inode->i_generation = get_seconds(); 1421 info = SHMEM_I(inode); 1422 memset(info, 0, (char *)inode - (char *)info); 1423 spin_lock_init(&info->lock); 1424 info->seals = F_SEAL_SEAL; 1425 info->flags = flags & VM_NORESERVE; 1426 INIT_LIST_HEAD(&info->swaplist); 1427 simple_xattrs_init(&info->xattrs); 1428 cache_no_acl(inode); 1429 1430 switch (mode & S_IFMT) { 1431 default: 1432 inode->i_op = &shmem_special_inode_operations; 1433 init_special_inode(inode, mode, dev); 1434 break; 1435 case S_IFREG: 1436 inode->i_mapping->a_ops = &shmem_aops; 1437 inode->i_op = &shmem_inode_operations; 1438 inode->i_fop = &shmem_file_operations; 1439 mpol_shared_policy_init(&info->policy, 1440 shmem_get_sbmpol(sbinfo)); 1441 break; 1442 case S_IFDIR: 1443 inc_nlink(inode); 1444 /* Some things misbehave if size == 0 on a directory */ 1445 inode->i_size = 2 * BOGO_DIRENT_SIZE; 1446 inode->i_op = &shmem_dir_inode_operations; 1447 inode->i_fop = &simple_dir_operations; 1448 break; 1449 case S_IFLNK: 1450 /* 1451 * Must not load anything in the rbtree, 1452 * mpol_free_shared_policy will not be called. 1453 */ 1454 mpol_shared_policy_init(&info->policy, NULL); 1455 break; 1456 } 1457 } else 1458 shmem_free_inode(sb); 1459 return inode; 1460} 1461 1462bool shmem_mapping(struct address_space *mapping) 1463{ 1464 return mapping->backing_dev_info == &shmem_backing_dev_info; 1465} 1466 1467#ifdef CONFIG_TMPFS 1468static const struct inode_operations shmem_symlink_inode_operations; 1469static const struct inode_operations shmem_short_symlink_operations; 1470 1471#ifdef CONFIG_TMPFS_XATTR 1472static int shmem_initxattrs(struct inode *, const struct xattr *, void *); 1473#else 1474#define shmem_initxattrs NULL 1475#endif 1476 1477static int 1478shmem_write_begin(struct file *file, struct address_space *mapping, 1479 loff_t pos, unsigned len, unsigned flags, 1480 struct page **pagep, void **fsdata) 1481{ 1482 struct inode *inode = mapping->host; 1483 struct shmem_inode_info *info = SHMEM_I(inode); 1484 pgoff_t index = pos >> PAGE_CACHE_SHIFT; 1485 1486 /* i_mutex is held by caller */ 1487 if (unlikely(info->seals)) { 1488 if (info->seals & F_SEAL_WRITE) 1489 return -EPERM; 1490 if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size) 1491 return -EPERM; 1492 } 1493 1494 return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL); 1495} 1496 1497static int 1498shmem_write_end(struct file *file, struct address_space *mapping, 1499 loff_t pos, unsigned len, unsigned copied, 1500 struct page *page, void *fsdata) 1501{ 1502 struct inode *inode = mapping->host; 1503 1504 if (pos + copied > inode->i_size) 1505 i_size_write(inode, pos + copied); 1506 1507 if (!PageUptodate(page)) { 1508 if (copied < PAGE_CACHE_SIZE) { 1509 unsigned from = pos & (PAGE_CACHE_SIZE - 1); 1510 zero_user_segments(page, 0, from, 1511 from + copied, PAGE_CACHE_SIZE); 1512 } 1513 SetPageUptodate(page); 1514 } 1515 set_page_dirty(page); 1516 unlock_page(page); 1517 page_cache_release(page); 1518 1519 return copied; 1520} 1521 1522static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) 1523{ 1524 struct file *file = iocb->ki_filp; 1525 struct inode *inode = file_inode(file); 1526 struct address_space *mapping = inode->i_mapping; 1527 pgoff_t index; 1528 unsigned long offset; 1529 enum sgp_type sgp = SGP_READ; 1530 int error = 0; 1531 ssize_t retval = 0; 1532 loff_t *ppos = &iocb->ki_pos; 1533 1534 /* 1535 * Might this read be for a stacking filesystem? Then when reading 1536 * holes of a sparse file, we actually need to allocate those pages, 1537 * and even mark them dirty, so it cannot exceed the max_blocks limit. 1538 */ 1539 if (segment_eq(get_fs(), KERNEL_DS)) 1540 sgp = SGP_DIRTY; 1541 1542 index = *ppos >> PAGE_CACHE_SHIFT; 1543 offset = *ppos & ~PAGE_CACHE_MASK; 1544 1545 for (;;) { 1546 struct page *page = NULL; 1547 pgoff_t end_index; 1548 unsigned long nr, ret; 1549 loff_t i_size = i_size_read(inode); 1550 1551 end_index = i_size >> PAGE_CACHE_SHIFT; 1552 if (index > end_index) 1553 break; 1554 if (index == end_index) { 1555 nr = i_size & ~PAGE_CACHE_MASK; 1556 if (nr <= offset) 1557 break; 1558 } 1559 1560 error = shmem_getpage(inode, index, &page, sgp, NULL); 1561 if (error) { 1562 if (error == -EINVAL) 1563 error = 0; 1564 break; 1565 } 1566 if (page) 1567 unlock_page(page); 1568 1569 /* 1570 * We must evaluate after, since reads (unlike writes) 1571 * are called without i_mutex protection against truncate 1572 */ 1573 nr = PAGE_CACHE_SIZE; 1574 i_size = i_size_read(inode); 1575 end_index = i_size >> PAGE_CACHE_SHIFT; 1576 if (index == end_index) { 1577 nr = i_size & ~PAGE_CACHE_MASK; 1578 if (nr <= offset) { 1579 if (page) 1580 page_cache_release(page); 1581 break; 1582 } 1583 } 1584 nr -= offset; 1585 1586 if (page) { 1587 /* 1588 * If users can be writing to this page using arbitrary 1589 * virtual addresses, take care about potential aliasing 1590 * before reading the page on the kernel side. 1591 */ 1592 if (mapping_writably_mapped(mapping)) 1593 flush_dcache_page(page); 1594 /* 1595 * Mark the page accessed if we read the beginning. 1596 */ 1597 if (!offset) 1598 mark_page_accessed(page); 1599 } else { 1600 page = ZERO_PAGE(0); 1601 page_cache_get(page); 1602 } 1603 1604 /* 1605 * Ok, we have the page, and it's up-to-date, so 1606 * now we can copy it to user space... 1607 */ 1608 ret = copy_page_to_iter(page, offset, nr, to); 1609 retval += ret; 1610 offset += ret; 1611 index += offset >> PAGE_CACHE_SHIFT; 1612 offset &= ~PAGE_CACHE_MASK; 1613 1614 page_cache_release(page); 1615 if (!iov_iter_count(to)) 1616 break; 1617 if (ret < nr) { 1618 error = -EFAULT; 1619 break; 1620 } 1621 cond_resched(); 1622 } 1623 1624 *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset; 1625 file_accessed(file); 1626 return retval ? retval : error; 1627} 1628 1629static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, 1630 struct pipe_inode_info *pipe, size_t len, 1631 unsigned int flags) 1632{ 1633 struct address_space *mapping = in->f_mapping; 1634 struct inode *inode = mapping->host; 1635 unsigned int loff, nr_pages, req_pages; 1636 struct page *pages[PIPE_DEF_BUFFERS]; 1637 struct partial_page partial[PIPE_DEF_BUFFERS]; 1638 struct page *page; 1639 pgoff_t index, end_index; 1640 loff_t isize, left; 1641 int error, page_nr; 1642 struct splice_pipe_desc spd = { 1643 .pages = pages, 1644 .partial = partial, 1645 .nr_pages_max = PIPE_DEF_BUFFERS, 1646 .flags = flags, 1647 .ops = &page_cache_pipe_buf_ops, 1648 .spd_release = spd_release_page, 1649 }; 1650 1651 isize = i_size_read(inode); 1652 if (unlikely(*ppos >= isize)) 1653 return 0; 1654 1655 left = isize - *ppos; 1656 if (unlikely(left < len)) 1657 len = left; 1658 1659 if (splice_grow_spd(pipe, &spd)) 1660 return -ENOMEM; 1661 1662 index = *ppos >> PAGE_CACHE_SHIFT; 1663 loff = *ppos & ~PAGE_CACHE_MASK; 1664 req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 1665 nr_pages = min(req_pages, spd.nr_pages_max); 1666 1667 spd.nr_pages = find_get_pages_contig(mapping, index, 1668 nr_pages, spd.pages); 1669 index += spd.nr_pages; 1670 error = 0; 1671 1672 while (spd.nr_pages < nr_pages) { 1673 error = shmem_getpage(inode, index, &page, SGP_CACHE, NULL); 1674 if (error) 1675 break; 1676 unlock_page(page); 1677 spd.pages[spd.nr_pages++] = page; 1678 index++; 1679 } 1680 1681 index = *ppos >> PAGE_CACHE_SHIFT; 1682 nr_pages = spd.nr_pages; 1683 spd.nr_pages = 0; 1684 1685 for (page_nr = 0; page_nr < nr_pages; page_nr++) { 1686 unsigned int this_len; 1687 1688 if (!len) 1689 break; 1690 1691 this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff); 1692 page = spd.pages[page_nr]; 1693 1694 if (!PageUptodate(page) || page->mapping != mapping) { 1695 error = shmem_getpage(inode, index, &page, 1696 SGP_CACHE, NULL); 1697 if (error) 1698 break; 1699 unlock_page(page); 1700 page_cache_release(spd.pages[page_nr]); 1701 spd.pages[page_nr] = page; 1702 } 1703 1704 isize = i_size_read(inode); 1705 end_index = (isize - 1) >> PAGE_CACHE_SHIFT; 1706 if (unlikely(!isize || index > end_index)) 1707 break; 1708 1709 if (end_index == index) { 1710 unsigned int plen; 1711 1712 plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1; 1713 if (plen <= loff) 1714 break; 1715 1716 this_len = min(this_len, plen - loff); 1717 len = this_len; 1718 } 1719 1720 spd.partial[page_nr].offset = loff; 1721 spd.partial[page_nr].len = this_len; 1722 len -= this_len; 1723 loff = 0; 1724 spd.nr_pages++; 1725 index++; 1726 } 1727 1728 while (page_nr < nr_pages) 1729 page_cache_release(spd.pages[page_nr++]); 1730 1731 if (spd.nr_pages) 1732 error = splice_to_pipe(pipe, &spd); 1733 1734 splice_shrink_spd(&spd); 1735 1736 if (error > 0) { 1737 *ppos += error; 1738 file_accessed(in); 1739 } 1740 return error; 1741} 1742 1743/* 1744 * llseek SEEK_DATA or SEEK_HOLE through the radix_tree. 1745 */ 1746static pgoff_t shmem_seek_hole_data(struct address_space *mapping, 1747 pgoff_t index, pgoff_t end, int whence) 1748{ 1749 struct page *page; 1750 struct pagevec pvec; 1751 pgoff_t indices[PAGEVEC_SIZE]; 1752 bool done = false; 1753 int i; 1754 1755 pagevec_init(&pvec, 0); 1756 pvec.nr = 1; /* start small: we may be there already */ 1757 while (!done) { 1758 pvec.nr = find_get_entries(mapping, index, 1759 pvec.nr, pvec.pages, indices); 1760 if (!pvec.nr) { 1761 if (whence == SEEK_DATA) 1762 index = end; 1763 break; 1764 } 1765 for (i = 0; i < pvec.nr; i++, index++) { 1766 if (index < indices[i]) { 1767 if (whence == SEEK_HOLE) { 1768 done = true; 1769 break; 1770 } 1771 index = indices[i]; 1772 } 1773 page = pvec.pages[i]; 1774 if (page && !radix_tree_exceptional_entry(page)) { 1775 if (!PageUptodate(page)) 1776 page = NULL; 1777 } 1778 if (index >= end || 1779 (page && whence == SEEK_DATA) || 1780 (!page && whence == SEEK_HOLE)) { 1781 done = true; 1782 break; 1783 } 1784 } 1785 pagevec_remove_exceptionals(&pvec); 1786 pagevec_release(&pvec); 1787 pvec.nr = PAGEVEC_SIZE; 1788 cond_resched(); 1789 } 1790 return index; 1791} 1792 1793static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence) 1794{ 1795 struct address_space *mapping = file->f_mapping; 1796 struct inode *inode = mapping->host; 1797 pgoff_t start, end; 1798 loff_t new_offset; 1799 1800 if (whence != SEEK_DATA && whence != SEEK_HOLE) 1801 return generic_file_llseek_size(file, offset, whence, 1802 MAX_LFS_FILESIZE, i_size_read(inode)); 1803 mutex_lock(&inode->i_mutex); 1804 /* We're holding i_mutex so we can access i_size directly */ 1805 1806 if (offset < 0) 1807 offset = -EINVAL; 1808 else if (offset >= inode->i_size) 1809 offset = -ENXIO; 1810 else { 1811 start = offset >> PAGE_CACHE_SHIFT; 1812 end = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 1813 new_offset = shmem_seek_hole_data(mapping, start, end, whence); 1814 new_offset <<= PAGE_CACHE_SHIFT; 1815 if (new_offset > offset) { 1816 if (new_offset < inode->i_size) 1817 offset = new_offset; 1818 else if (whence == SEEK_DATA) 1819 offset = -ENXIO; 1820 else 1821 offset = inode->i_size; 1822 } 1823 } 1824 1825 if (offset >= 0) 1826 offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE); 1827 mutex_unlock(&inode->i_mutex); 1828 return offset; 1829} 1830 1831static int shmem_wait_for_pins(struct address_space *mapping) 1832{ 1833 return 0; 1834} 1835 1836#define F_ALL_SEALS (F_SEAL_SEAL | \ 1837 F_SEAL_SHRINK | \ 1838 F_SEAL_GROW | \ 1839 F_SEAL_WRITE) 1840 1841int shmem_add_seals(struct file *file, unsigned int seals) 1842{ 1843 struct inode *inode = file_inode(file); 1844 struct shmem_inode_info *info = SHMEM_I(inode); 1845 int error; 1846 1847 /* 1848 * SEALING 1849 * Sealing allows multiple parties to share a shmem-file but restrict 1850 * access to a specific subset of file operations. Seals can only be 1851 * added, but never removed. This way, mutually untrusted parties can 1852 * share common memory regions with a well-defined policy. A malicious 1853 * peer can thus never perform unwanted operations on a shared object. 1854 * 1855 * Seals are only supported on special shmem-files and always affect 1856 * the whole underlying inode. Once a seal is set, it may prevent some 1857 * kinds of access to the file. Currently, the following seals are 1858 * defined: 1859 * SEAL_SEAL: Prevent further seals from being set on this file 1860 * SEAL_SHRINK: Prevent the file from shrinking 1861 * SEAL_GROW: Prevent the file from growing 1862 * SEAL_WRITE: Prevent write access to the file 1863 * 1864 * As we don't require any trust relationship between two parties, we 1865 * must prevent seals from being removed. Therefore, sealing a file 1866 * only adds a given set of seals to the file, it never touches 1867 * existing seals. Furthermore, the "setting seals"-operation can be 1868 * sealed itself, which basically prevents any further seal from being 1869 * added. 1870 * 1871 * Semantics of sealing are only defined on volatile files. Only 1872 * anonymous shmem files support sealing. More importantly, seals are 1873 * never written to disk. Therefore, there's no plan to support it on 1874 * other file types. 1875 */ 1876 1877 if (file->f_op != &shmem_file_operations) 1878 return -EINVAL; 1879 if (!(file->f_mode & FMODE_WRITE)) 1880 return -EPERM; 1881 if (seals & ~(unsigned int)F_ALL_SEALS) 1882 return -EINVAL; 1883 1884 mutex_lock(&inode->i_mutex); 1885 1886 if (info->seals & F_SEAL_SEAL) { 1887 error = -EPERM; 1888 goto unlock; 1889 } 1890 1891 if ((seals & F_SEAL_WRITE) && !(info->seals & F_SEAL_WRITE)) { 1892 error = mapping_deny_writable(file->f_mapping); 1893 if (error) 1894 goto unlock; 1895 1896 error = shmem_wait_for_pins(file->f_mapping); 1897 if (error) { 1898 mapping_allow_writable(file->f_mapping); 1899 goto unlock; 1900 } 1901 } 1902 1903 info->seals |= seals; 1904 error = 0; 1905 1906unlock: 1907 mutex_unlock(&inode->i_mutex); 1908 return error; 1909} 1910EXPORT_SYMBOL_GPL(shmem_add_seals); 1911 1912int shmem_get_seals(struct file *file) 1913{ 1914 if (file->f_op != &shmem_file_operations) 1915 return -EINVAL; 1916 1917 return SHMEM_I(file_inode(file))->seals; 1918} 1919EXPORT_SYMBOL_GPL(shmem_get_seals); 1920 1921long shmem_fcntl(struct file *file, unsigned int cmd, unsigned long arg) 1922{ 1923 long error; 1924 1925 switch (cmd) { 1926 case F_ADD_SEALS: 1927 /* disallow upper 32bit */ 1928 if (arg > UINT_MAX) 1929 return -EINVAL; 1930 1931 error = shmem_add_seals(file, arg); 1932 break; 1933 case F_GET_SEALS: 1934 error = shmem_get_seals(file); 1935 break; 1936 default: 1937 error = -EINVAL; 1938 break; 1939 } 1940 1941 return error; 1942} 1943 1944static long shmem_fallocate(struct file *file, int mode, loff_t offset, 1945 loff_t len) 1946{ 1947 struct inode *inode = file_inode(file); 1948 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); 1949 struct shmem_inode_info *info = SHMEM_I(inode); 1950 struct shmem_falloc shmem_falloc; 1951 pgoff_t start, index, end; 1952 int error; 1953 1954 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) 1955 return -EOPNOTSUPP; 1956 1957 mutex_lock(&inode->i_mutex); 1958 1959 if (mode & FALLOC_FL_PUNCH_HOLE) { 1960 struct address_space *mapping = file->f_mapping; 1961 loff_t unmap_start = round_up(offset, PAGE_SIZE); 1962 loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1; 1963 DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq); 1964 1965 /* protected by i_mutex */ 1966 if (info->seals & F_SEAL_WRITE) { 1967 error = -EPERM; 1968 goto out; 1969 } 1970 1971 shmem_falloc.waitq = &shmem_falloc_waitq; 1972 shmem_falloc.start = unmap_start >> PAGE_SHIFT; 1973 shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT; 1974 spin_lock(&inode->i_lock); 1975 inode->i_private = &shmem_falloc; 1976 spin_unlock(&inode->i_lock); 1977 1978 if ((u64)unmap_end > (u64)unmap_start) 1979 unmap_mapping_range(mapping, unmap_start, 1980 1 + unmap_end - unmap_start, 0); 1981 shmem_truncate_range(inode, offset, offset + len - 1); 1982 /* No need to unmap again: hole-punching leaves COWed pages */ 1983 1984 spin_lock(&inode->i_lock); 1985 inode->i_private = NULL; 1986 wake_up_all(&shmem_falloc_waitq); 1987 spin_unlock(&inode->i_lock); 1988 error = 0; 1989 goto out; 1990 } 1991 1992 /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */ 1993 error = inode_newsize_ok(inode, offset + len); 1994 if (error) 1995 goto out; 1996 1997 if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) { 1998 error = -EPERM; 1999 goto out; 2000 } 2001 2002 start = offset >> PAGE_CACHE_SHIFT; 2003 end = (offset + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 2004 /* Try to avoid a swapstorm if len is impossible to satisfy */ 2005 if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) { 2006 error = -ENOSPC; 2007 goto out; 2008 } 2009 2010 shmem_falloc.waitq = NULL; 2011 shmem_falloc.start = start; 2012 shmem_falloc.next = start; 2013 shmem_falloc.nr_falloced = 0; 2014 shmem_falloc.nr_unswapped = 0; 2015 spin_lock(&inode->i_lock); 2016 inode->i_private = &shmem_falloc; 2017 spin_unlock(&inode->i_lock); 2018 2019 for (index = start; index < end; index++) { 2020 struct page *page; 2021 2022 /* 2023 * Good, the fallocate(2) manpage permits EINTR: we may have 2024 * been interrupted because we are using up too much memory. 2025 */ 2026 if (signal_pending(current)) 2027 error = -EINTR; 2028 else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced) 2029 error = -ENOMEM; 2030 else 2031 error = shmem_getpage(inode, index, &page, SGP_FALLOC, 2032 NULL); 2033 if (error) { 2034 /* Remove the !PageUptodate pages we added */ 2035 shmem_undo_range(inode, 2036 (loff_t)start << PAGE_CACHE_SHIFT, 2037 (loff_t)index << PAGE_CACHE_SHIFT, true); 2038 goto undone; 2039 } 2040 2041 /* 2042 * Inform shmem_writepage() how far we have reached. 2043 * No need for lock or barrier: we have the page lock. 2044 */ 2045 shmem_falloc.next++; 2046 if (!PageUptodate(page)) 2047 shmem_falloc.nr_falloced++; 2048 2049 /* 2050 * If !PageUptodate, leave it that way so that freeable pages 2051 * can be recognized if we need to rollback on error later. 2052 * But set_page_dirty so that memory pressure will swap rather 2053 * than free the pages we are allocating (and SGP_CACHE pages 2054 * might still be clean: we now need to mark those dirty too). 2055 */ 2056 set_page_dirty(page); 2057 unlock_page(page); 2058 page_cache_release(page); 2059 cond_resched(); 2060 } 2061 2062 if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) 2063 i_size_write(inode, offset + len); 2064 inode->i_ctime = CURRENT_TIME; 2065undone: 2066 spin_lock(&inode->i_lock); 2067 inode->i_private = NULL; 2068 spin_unlock(&inode->i_lock); 2069out: 2070 mutex_unlock(&inode->i_mutex); 2071 return error; 2072} 2073 2074static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) 2075{ 2076 struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); 2077 2078 buf->f_type = TMPFS_MAGIC; 2079 buf->f_bsize = PAGE_CACHE_SIZE; 2080 buf->f_namelen = NAME_MAX; 2081 if (sbinfo->max_blocks) { 2082 buf->f_blocks = sbinfo->max_blocks; 2083 buf->f_bavail = 2084 buf->f_bfree = sbinfo->max_blocks - 2085 percpu_counter_sum(&sbinfo->used_blocks); 2086 } 2087 if (sbinfo->max_inodes) { 2088 buf->f_files = sbinfo->max_inodes; 2089 buf->f_ffree = sbinfo->free_inodes; 2090 } 2091 /* else leave those fields 0 like simple_statfs */ 2092 return 0; 2093} 2094 2095/* 2096 * File creation. Allocate an inode, and we're done.. 2097 */ 2098static int 2099shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) 2100{ 2101 struct inode *inode; 2102 int error = -ENOSPC; 2103 2104 inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE); 2105 if (inode) { 2106 error = simple_acl_create(dir, inode); 2107 if (error) 2108 goto out_iput; 2109 error = security_inode_init_security(inode, dir, 2110 &dentry->d_name, 2111 shmem_initxattrs, NULL); 2112 if (error && error != -EOPNOTSUPP) 2113 goto out_iput; 2114 2115 error = 0; 2116 dir->i_size += BOGO_DIRENT_SIZE; 2117 dir->i_ctime = dir->i_mtime = CURRENT_TIME; 2118 d_instantiate(dentry, inode); 2119 dget(dentry); /* Extra count - pin the dentry in core */ 2120 } 2121 return error; 2122out_iput: 2123 iput(inode); 2124 return error; 2125} 2126 2127static int 2128shmem_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) 2129{ 2130 struct inode *inode; 2131 int error = -ENOSPC; 2132 2133 inode = shmem_get_inode(dir->i_sb, dir, mode, 0, VM_NORESERVE); 2134 if (inode) { 2135 error = security_inode_init_security(inode, dir, 2136 NULL, 2137 shmem_initxattrs, NULL); 2138 if (error && error != -EOPNOTSUPP) 2139 goto out_iput; 2140 error = simple_acl_create(dir, inode); 2141 if (error) 2142 goto out_iput; 2143 d_tmpfile(dentry, inode); 2144 } 2145 return error; 2146out_iput: 2147 iput(inode); 2148 return error; 2149} 2150 2151static int shmem_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) 2152{ 2153 int error; 2154 2155 if ((error = shmem_mknod(dir, dentry, mode | S_IFDIR, 0))) 2156 return error; 2157 inc_nlink(dir); 2158 return 0; 2159} 2160 2161static int shmem_create(struct inode *dir, struct dentry *dentry, umode_t mode, 2162 bool excl) 2163{ 2164 return shmem_mknod(dir, dentry, mode | S_IFREG, 0); 2165} 2166 2167/* 2168 * Link a file.. 2169 */ 2170static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) 2171{ 2172 struct inode *inode = old_dentry->d_inode; 2173 int ret; 2174 2175 /* 2176 * No ordinary (disk based) filesystem counts links as inodes; 2177 * but each new link needs a new dentry, pinning lowmem, and 2178 * tmpfs dentries cannot be pruned until they are unlinked. 2179 */ 2180 ret = shmem_reserve_inode(inode->i_sb); 2181 if (ret) 2182 goto out; 2183 2184 dir->i_size += BOGO_DIRENT_SIZE; 2185 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; 2186 inc_nlink(inode); 2187 ihold(inode); /* New dentry reference */ 2188 dget(dentry); /* Extra pinning count for the created dentry */ 2189 d_instantiate(dentry, inode); 2190out: 2191 return ret; 2192} 2193 2194static int shmem_unlink(struct inode *dir, struct dentry *dentry) 2195{ 2196 struct inode *inode = dentry->d_inode; 2197 2198 if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode)) 2199 shmem_free_inode(inode->i_sb); 2200 2201 dir->i_size -= BOGO_DIRENT_SIZE; 2202 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; 2203 drop_nlink(inode); 2204 dput(dentry); /* Undo the count from "create" - this does all the work */ 2205 return 0; 2206} 2207 2208static int shmem_rmdir(struct inode *dir, struct dentry *dentry) 2209{ 2210 if (!simple_empty(dentry)) 2211 return -ENOTEMPTY; 2212 2213 drop_nlink(dentry->d_inode); 2214 drop_nlink(dir); 2215 return shmem_unlink(dir, dentry); 2216} 2217 2218/* 2219 * The VFS layer already does all the dentry stuff for rename, 2220 * we just have to decrement the usage count for the target if 2221 * it exists so that the VFS layer correctly free's it when it 2222 * gets overwritten. 2223 */ 2224static int shmem_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) 2225{ 2226 struct inode *inode = old_dentry->d_inode; 2227 int they_are_dirs = S_ISDIR(inode->i_mode); 2228 2229 if (!simple_empty(new_dentry)) 2230 return -ENOTEMPTY; 2231 2232 if (new_dentry->d_inode) { 2233 (void) shmem_unlink(new_dir, new_dentry); 2234 if (they_are_dirs) 2235 drop_nlink(old_dir); 2236 } else if (they_are_dirs) { 2237 drop_nlink(old_dir); 2238 inc_nlink(new_dir); 2239 } 2240 2241 old_dir->i_size -= BOGO_DIRENT_SIZE; 2242 new_dir->i_size += BOGO_DIRENT_SIZE; 2243 old_dir->i_ctime = old_dir->i_mtime = 2244 new_dir->i_ctime = new_dir->i_mtime = 2245 inode->i_ctime = CURRENT_TIME; 2246 return 0; 2247} 2248 2249static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *symname) 2250{ 2251 int error; 2252 int len; 2253 struct inode *inode; 2254 struct page *page; 2255 char *kaddr; 2256 struct shmem_inode_info *info; 2257 2258 len = strlen(symname) + 1; 2259 if (len > PAGE_CACHE_SIZE) 2260 return -ENAMETOOLONG; 2261 2262 inode = shmem_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0, VM_NORESERVE); 2263 if (!inode) 2264 return -ENOSPC; 2265 2266 error = security_inode_init_security(inode, dir, &dentry->d_name, 2267 shmem_initxattrs, NULL); 2268 if (error) { 2269 if (error != -EOPNOTSUPP) { 2270 iput(inode); 2271 return error; 2272 } 2273 error = 0; 2274 } 2275 2276 info = SHMEM_I(inode); 2277 inode->i_size = len-1; 2278 if (len <= SHORT_SYMLINK_LEN) { 2279 info->symlink = kmemdup(symname, len, GFP_KERNEL); 2280 if (!info->symlink) { 2281 iput(inode); 2282 return -ENOMEM; 2283 } 2284 inode->i_op = &shmem_short_symlink_operations; 2285 } else { 2286 error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL); 2287 if (error) { 2288 iput(inode); 2289 return error; 2290 } 2291 inode->i_mapping->a_ops = &shmem_aops; 2292 inode->i_op = &shmem_symlink_inode_operations; 2293 kaddr = kmap_atomic(page); 2294 memcpy(kaddr, symname, len); 2295 kunmap_atomic(kaddr); 2296 SetPageUptodate(page); 2297 set_page_dirty(page); 2298 unlock_page(page); 2299 page_cache_release(page); 2300 } 2301 dir->i_size += BOGO_DIRENT_SIZE; 2302 dir->i_ctime = dir->i_mtime = CURRENT_TIME; 2303 d_instantiate(dentry, inode); 2304 dget(dentry); 2305 return 0; 2306} 2307 2308static void *shmem_follow_short_symlink(struct dentry *dentry, struct nameidata *nd) 2309{ 2310 nd_set_link(nd, SHMEM_I(dentry->d_inode)->symlink); 2311 return NULL; 2312} 2313 2314static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd) 2315{ 2316 struct page *page = NULL; 2317 int error = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL); 2318 nd_set_link(nd, error ? ERR_PTR(error) : kmap(page)); 2319 if (page) 2320 unlock_page(page); 2321 return page; 2322} 2323 2324static void shmem_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) 2325{ 2326 if (!IS_ERR(nd_get_link(nd))) { 2327 struct page *page = cookie; 2328 kunmap(page); 2329 mark_page_accessed(page); 2330 page_cache_release(page); 2331 } 2332} 2333 2334#ifdef CONFIG_TMPFS_XATTR 2335/* 2336 * Superblocks without xattr inode operations may get some security.* xattr 2337 * support from the LSM "for free". As soon as we have any other xattrs 2338 * like ACLs, we also need to implement the security.* handlers at 2339 * filesystem level, though. 2340 */ 2341 2342/* 2343 * Callback for security_inode_init_security() for acquiring xattrs. 2344 */ 2345static int shmem_initxattrs(struct inode *inode, 2346 const struct xattr *xattr_array, 2347 void *fs_info) 2348{ 2349 struct shmem_inode_info *info = SHMEM_I(inode); 2350 const struct xattr *xattr; 2351 struct simple_xattr *new_xattr; 2352 size_t len; 2353 2354 for (xattr = xattr_array; xattr->name != NULL; xattr++) { 2355 new_xattr = simple_xattr_alloc(xattr->value, xattr->value_len); 2356 if (!new_xattr) 2357 return -ENOMEM; 2358 2359 len = strlen(xattr->name) + 1; 2360 new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len, 2361 GFP_KERNEL); 2362 if (!new_xattr->name) { 2363 kfree(new_xattr); 2364 return -ENOMEM; 2365 } 2366 2367 memcpy(new_xattr->name, XATTR_SECURITY_PREFIX, 2368 XATTR_SECURITY_PREFIX_LEN); 2369 memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN, 2370 xattr->name, len); 2371 2372 simple_xattr_list_add(&info->xattrs, new_xattr); 2373 } 2374 2375 return 0; 2376} 2377 2378static const struct xattr_handler *shmem_xattr_handlers[] = { 2379#ifdef CONFIG_TMPFS_POSIX_ACL 2380 &posix_acl_access_xattr_handler, 2381 &posix_acl_default_xattr_handler, 2382#endif 2383 NULL 2384}; 2385 2386static int shmem_xattr_validate(const char *name) 2387{ 2388 struct { const char *prefix; size_t len; } arr[] = { 2389 { XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN }, 2390 { XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN } 2391 }; 2392 int i; 2393 2394 for (i = 0; i < ARRAY_SIZE(arr); i++) { 2395 size_t preflen = arr[i].len; 2396 if (strncmp(name, arr[i].prefix, preflen) == 0) { 2397 if (!name[preflen]) 2398 return -EINVAL; 2399 return 0; 2400 } 2401 } 2402 return -EOPNOTSUPP; 2403} 2404 2405static ssize_t shmem_getxattr(struct dentry *dentry, const char *name, 2406 void *buffer, size_t size) 2407{ 2408 struct shmem_inode_info *info = SHMEM_I(dentry->d_inode); 2409 int err; 2410 2411 /* 2412 * If this is a request for a synthetic attribute in the system.* 2413 * namespace use the generic infrastructure to resolve a handler 2414 * for it via sb->s_xattr. 2415 */ 2416 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) 2417 return generic_getxattr(dentry, name, buffer, size); 2418 2419 err = shmem_xattr_validate(name); 2420 if (err) 2421 return err; 2422 2423 return simple_xattr_get(&info->xattrs, name, buffer, size); 2424} 2425 2426static int shmem_setxattr(struct dentry *dentry, const char *name, 2427 const void *value, size_t size, int flags) 2428{ 2429 struct shmem_inode_info *info = SHMEM_I(dentry->d_inode); 2430 int err; 2431 2432 /* 2433 * If this is a request for a synthetic attribute in the system.* 2434 * namespace use the generic infrastructure to resolve a handler 2435 * for it via sb->s_xattr. 2436 */ 2437 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) 2438 return generic_setxattr(dentry, name, value, size, flags); 2439 2440 err = shmem_xattr_validate(name); 2441 if (err) 2442 return err; 2443 2444 return simple_xattr_set(&info->xattrs, name, value, size, flags); 2445} 2446 2447static int shmem_removexattr(struct dentry *dentry, const char *name) 2448{ 2449 struct shmem_inode_info *info = SHMEM_I(dentry->d_inode); 2450 int err; 2451 2452 /* 2453 * If this is a request for a synthetic attribute in the system.* 2454 * namespace use the generic infrastructure to resolve a handler 2455 * for it via sb->s_xattr. 2456 */ 2457 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) 2458 return generic_removexattr(dentry, name); 2459 2460 err = shmem_xattr_validate(name); 2461 if (err) 2462 return err; 2463 2464 return simple_xattr_remove(&info->xattrs, name); 2465} 2466 2467static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size) 2468{ 2469 struct shmem_inode_info *info = SHMEM_I(dentry->d_inode); 2470 return simple_xattr_list(&info->xattrs, buffer, size); 2471} 2472#endif /* CONFIG_TMPFS_XATTR */ 2473 2474static const struct inode_operations shmem_short_symlink_operations = { 2475 .readlink = generic_readlink, 2476 .follow_link = shmem_follow_short_symlink, 2477#ifdef CONFIG_TMPFS_XATTR 2478 .setxattr = shmem_setxattr, 2479 .getxattr = shmem_getxattr, 2480 .listxattr = shmem_listxattr, 2481 .removexattr = shmem_removexattr, 2482#endif 2483}; 2484 2485static const struct inode_operations shmem_symlink_inode_operations = { 2486 .readlink = generic_readlink, 2487 .follow_link = shmem_follow_link, 2488 .put_link = shmem_put_link, 2489#ifdef CONFIG_TMPFS_XATTR 2490 .setxattr = shmem_setxattr, 2491 .getxattr = shmem_getxattr, 2492 .listxattr = shmem_listxattr, 2493 .removexattr = shmem_removexattr, 2494#endif 2495}; 2496 2497static struct dentry *shmem_get_parent(struct dentry *child) 2498{ 2499 return ERR_PTR(-ESTALE); 2500} 2501 2502static int shmem_match(struct inode *ino, void *vfh) 2503{ 2504 __u32 *fh = vfh; 2505 __u64 inum = fh[2]; 2506 inum = (inum << 32) | fh[1]; 2507 return ino->i_ino == inum && fh[0] == ino->i_generation; 2508} 2509 2510static struct dentry *shmem_fh_to_dentry(struct super_block *sb, 2511 struct fid *fid, int fh_len, int fh_type) 2512{ 2513 struct inode *inode; 2514 struct dentry *dentry = NULL; 2515 u64 inum; 2516 2517 if (fh_len < 3) 2518 return NULL; 2519 2520 inum = fid->raw[2]; 2521 inum = (inum << 32) | fid->raw[1]; 2522 2523 inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]), 2524 shmem_match, fid->raw); 2525 if (inode) { 2526 dentry = d_find_alias(inode); 2527 iput(inode); 2528 } 2529 2530 return dentry; 2531} 2532 2533static int shmem_encode_fh(struct inode *inode, __u32 *fh, int *len, 2534 struct inode *parent) 2535{ 2536 if (*len < 3) { 2537 *len = 3; 2538 return FILEID_INVALID; 2539 } 2540 2541 if (inode_unhashed(inode)) { 2542 /* Unfortunately insert_inode_hash is not idempotent, 2543 * so as we hash inodes here rather than at creation 2544 * time, we need a lock to ensure we only try 2545 * to do it once 2546 */ 2547 static DEFINE_SPINLOCK(lock); 2548 spin_lock(&lock); 2549 if (inode_unhashed(inode)) 2550 __insert_inode_hash(inode, 2551 inode->i_ino + inode->i_generation); 2552 spin_unlock(&lock); 2553 } 2554 2555 fh[0] = inode->i_generation; 2556 fh[1] = inode->i_ino; 2557 fh[2] = ((__u64)inode->i_ino) >> 32; 2558 2559 *len = 3; 2560 return 1; 2561} 2562 2563static const struct export_operations shmem_export_ops = { 2564 .get_parent = shmem_get_parent, 2565 .encode_fh = shmem_encode_fh, 2566 .fh_to_dentry = shmem_fh_to_dentry, 2567}; 2568 2569static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo, 2570 bool remount) 2571{ 2572 char *this_char, *value, *rest; 2573 struct mempolicy *mpol = NULL; 2574 uid_t uid; 2575 gid_t gid; 2576 2577 while (options != NULL) { 2578 this_char = options; 2579 for (;;) { 2580 /* 2581 * NUL-terminate this option: unfortunately, 2582 * mount options form a comma-separated list, 2583 * but mpol's nodelist may also contain commas. 2584 */ 2585 options = strchr(options, ','); 2586 if (options == NULL) 2587 break; 2588 options++; 2589 if (!isdigit(*options)) { 2590 options[-1] = '\0'; 2591 break; 2592 } 2593 } 2594 if (!*this_char) 2595 continue; 2596 if ((value = strchr(this_char,'=')) != NULL) { 2597 *value++ = 0; 2598 } else { 2599 printk(KERN_ERR 2600 "tmpfs: No value for mount option '%s'\n", 2601 this_char); 2602 goto error; 2603 } 2604 2605 if (!strcmp(this_char,"size")) { 2606 unsigned long long size; 2607 size = memparse(value,&rest); 2608 if (*rest == '%') { 2609 size <<= PAGE_SHIFT; 2610 size *= totalram_pages; 2611 do_div(size, 100); 2612 rest++; 2613 } 2614 if (*rest) 2615 goto bad_val; 2616 sbinfo->max_blocks = 2617 DIV_ROUND_UP(size, PAGE_CACHE_SIZE); 2618 } else if (!strcmp(this_char,"nr_blocks")) { 2619 sbinfo->max_blocks = memparse(value, &rest); 2620 if (*rest) 2621 goto bad_val; 2622 } else if (!strcmp(this_char,"nr_inodes")) { 2623 sbinfo->max_inodes = memparse(value, &rest); 2624 if (*rest) 2625 goto bad_val; 2626 } else if (!strcmp(this_char,"mode")) { 2627 if (remount) 2628 continue; 2629 sbinfo->mode = simple_strtoul(value, &rest, 8) & 07777; 2630 if (*rest) 2631 goto bad_val; 2632 } else if (!strcmp(this_char,"uid")) { 2633 if (remount) 2634 continue; 2635 uid = simple_strtoul(value, &rest, 0); 2636 if (*rest) 2637 goto bad_val; 2638 sbinfo->uid = make_kuid(current_user_ns(), uid); 2639 if (!uid_valid(sbinfo->uid)) 2640 goto bad_val; 2641 } else if (!strcmp(this_char,"gid")) { 2642 if (remount) 2643 continue; 2644 gid = simple_strtoul(value, &rest, 0); 2645 if (*rest) 2646 goto bad_val; 2647 sbinfo->gid = make_kgid(current_user_ns(), gid); 2648 if (!gid_valid(sbinfo->gid)) 2649 goto bad_val; 2650 } else if (!strcmp(this_char,"mpol")) { 2651 mpol_put(mpol); 2652 mpol = NULL; 2653 if (mpol_parse_str(value, &mpol)) 2654 goto bad_val; 2655 } else { 2656 printk(KERN_ERR "tmpfs: Bad mount option %s\n", 2657 this_char); 2658 goto error; 2659 } 2660 } 2661 sbinfo->mpol = mpol; 2662 return 0; 2663 2664bad_val: 2665 printk(KERN_ERR "tmpfs: Bad value '%s' for mount option '%s'\n", 2666 value, this_char); 2667error: 2668 mpol_put(mpol); 2669 return 1; 2670 2671} 2672 2673static int shmem_remount_fs(struct super_block *sb, int *flags, char *data) 2674{ 2675 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 2676 struct shmem_sb_info config = *sbinfo; 2677 unsigned long inodes; 2678 int error = -EINVAL; 2679 2680 config.mpol = NULL; 2681 if (shmem_parse_options(data, &config, true)) 2682 return error; 2683 2684 spin_lock(&sbinfo->stat_lock); 2685 inodes = sbinfo->max_inodes - sbinfo->free_inodes; 2686 if (percpu_counter_compare(&sbinfo->used_blocks, config.max_blocks) > 0) 2687 goto out; 2688 if (config.max_inodes < inodes) 2689 goto out; 2690 /* 2691 * Those tests disallow limited->unlimited while any are in use; 2692 * but we must separately disallow unlimited->limited, because 2693 * in that case we have no record of how much is already in use. 2694 */ 2695 if (config.max_blocks && !sbinfo->max_blocks) 2696 goto out; 2697 if (config.max_inodes && !sbinfo->max_inodes) 2698 goto out; 2699 2700 error = 0; 2701 sbinfo->max_blocks = config.max_blocks; 2702 sbinfo->max_inodes = config.max_inodes; 2703 sbinfo->free_inodes = config.max_inodes - inodes; 2704 2705 /* 2706 * Preserve previous mempolicy unless mpol remount option was specified. 2707 */ 2708 if (config.mpol) { 2709 mpol_put(sbinfo->mpol); 2710 sbinfo->mpol = config.mpol; /* transfers initial ref */ 2711 } 2712out: 2713 spin_unlock(&sbinfo->stat_lock); 2714 return error; 2715} 2716 2717static int shmem_show_options(struct seq_file *seq, struct dentry *root) 2718{ 2719 struct shmem_sb_info *sbinfo = SHMEM_SB(root->d_sb); 2720 2721 if (sbinfo->max_blocks != shmem_default_max_blocks()) 2722 seq_printf(seq, ",size=%luk", 2723 sbinfo->max_blocks << (PAGE_CACHE_SHIFT - 10)); 2724 if (sbinfo->max_inodes != shmem_default_max_inodes()) 2725 seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes); 2726 if (sbinfo->mode != (S_IRWXUGO | S_ISVTX)) 2727 seq_printf(seq, ",mode=%03ho", sbinfo->mode); 2728 if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID)) 2729 seq_printf(seq, ",uid=%u", 2730 from_kuid_munged(&init_user_ns, sbinfo->uid)); 2731 if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID)) 2732 seq_printf(seq, ",gid=%u", 2733 from_kgid_munged(&init_user_ns, sbinfo->gid)); 2734 shmem_show_mpol(seq, sbinfo->mpol); 2735 return 0; 2736} 2737 2738#define MFD_NAME_PREFIX "memfd:" 2739#define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1) 2740#define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN) 2741 2742#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING) 2743 2744SYSCALL_DEFINE2(memfd_create, 2745 const char __user *, uname, 2746 unsigned int, flags) 2747{ 2748 struct shmem_inode_info *info; 2749 struct file *file; 2750 int fd, error; 2751 char *name; 2752 long len; 2753 2754 if (flags & ~(unsigned int)MFD_ALL_FLAGS) 2755 return -EINVAL; 2756 2757 /* length includes terminating zero */ 2758 len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1); 2759 if (len <= 0) 2760 return -EFAULT; 2761 if (len > MFD_NAME_MAX_LEN + 1) 2762 return -EINVAL; 2763 2764 name = kmalloc(len + MFD_NAME_PREFIX_LEN, GFP_TEMPORARY); 2765 if (!name) 2766 return -ENOMEM; 2767 2768 strcpy(name, MFD_NAME_PREFIX); 2769 if (copy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, len)) { 2770 error = -EFAULT; 2771 goto err_name; 2772 } 2773 2774 /* terminating-zero may have changed after strnlen_user() returned */ 2775 if (name[len + MFD_NAME_PREFIX_LEN - 1]) { 2776 error = -EFAULT; 2777 goto err_name; 2778 } 2779 2780 fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0); 2781 if (fd < 0) { 2782 error = fd; 2783 goto err_name; 2784 } 2785 2786 file = shmem_file_setup(name, 0, VM_NORESERVE); 2787 if (IS_ERR(file)) { 2788 error = PTR_ERR(file); 2789 goto err_fd; 2790 } 2791 info = SHMEM_I(file_inode(file)); 2792 file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; 2793 file->f_flags |= O_RDWR | O_LARGEFILE; 2794 if (flags & MFD_ALLOW_SEALING) 2795 info->seals &= ~F_SEAL_SEAL; 2796 2797 fd_install(fd, file); 2798 kfree(name); 2799 return fd; 2800 2801err_fd: 2802 put_unused_fd(fd); 2803err_name: 2804 kfree(name); 2805 return error; 2806} 2807 2808#endif /* CONFIG_TMPFS */ 2809 2810static void shmem_put_super(struct super_block *sb) 2811{ 2812 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 2813 2814 percpu_counter_destroy(&sbinfo->used_blocks); 2815 mpol_put(sbinfo->mpol); 2816 kfree(sbinfo); 2817 sb->s_fs_info = NULL; 2818} 2819 2820int shmem_fill_super(struct super_block *sb, void *data, int silent) 2821{ 2822 struct inode *inode; 2823 struct shmem_sb_info *sbinfo; 2824 int err = -ENOMEM; 2825 2826 /* Round up to L1_CACHE_BYTES to resist false sharing */ 2827 sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info), 2828 L1_CACHE_BYTES), GFP_KERNEL); 2829 if (!sbinfo) 2830 return -ENOMEM; 2831 2832 sbinfo->mode = S_IRWXUGO | S_ISVTX; 2833 sbinfo->uid = current_fsuid(); 2834 sbinfo->gid = current_fsgid(); 2835 sb->s_fs_info = sbinfo; 2836 2837#ifdef CONFIG_TMPFS 2838 /* 2839 * Per default we only allow half of the physical ram per 2840 * tmpfs instance, limiting inodes to one per page of lowmem; 2841 * but the internal instance is left unlimited. 2842 */ 2843 if (!(sb->s_flags & MS_KERNMOUNT)) { 2844 sbinfo->max_blocks = shmem_default_max_blocks(); 2845 sbinfo->max_inodes = shmem_default_max_inodes(); 2846 if (shmem_parse_options(data, sbinfo, false)) { 2847 err = -EINVAL; 2848 goto failed; 2849 } 2850 } else { 2851 sb->s_flags |= MS_NOUSER; 2852 } 2853 sb->s_export_op = &shmem_export_ops; 2854 sb->s_flags |= MS_NOSEC; 2855#else 2856 sb->s_flags |= MS_NOUSER; 2857#endif 2858 2859 spin_lock_init(&sbinfo->stat_lock); 2860 if (percpu_counter_init(&sbinfo->used_blocks, 0)) 2861 goto failed; 2862 sbinfo->free_inodes = sbinfo->max_inodes; 2863 2864 sb->s_maxbytes = MAX_LFS_FILESIZE; 2865 sb->s_blocksize = PAGE_CACHE_SIZE; 2866 sb->s_blocksize_bits = PAGE_CACHE_SHIFT; 2867 sb->s_magic = TMPFS_MAGIC; 2868 sb->s_op = &shmem_ops; 2869 sb->s_time_gran = 1; 2870#ifdef CONFIG_TMPFS_XATTR 2871 sb->s_xattr = shmem_xattr_handlers; 2872#endif 2873#ifdef CONFIG_TMPFS_POSIX_ACL 2874 sb->s_flags |= MS_POSIXACL; 2875#endif 2876 2877 inode = shmem_get_inode(sb, NULL, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE); 2878 if (!inode) 2879 goto failed; 2880 inode->i_uid = sbinfo->uid; 2881 inode->i_gid = sbinfo->gid; 2882 sb->s_root = d_make_root(inode); 2883 if (!sb->s_root) 2884 goto failed; 2885 return 0; 2886 2887failed: 2888 shmem_put_super(sb); 2889 return err; 2890} 2891 2892static struct kmem_cache *shmem_inode_cachep; 2893 2894static struct inode *shmem_alloc_inode(struct super_block *sb) 2895{ 2896 struct shmem_inode_info *info; 2897 info = kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL); 2898 if (!info) 2899 return NULL; 2900 return &info->vfs_inode; 2901} 2902 2903static void shmem_destroy_callback(struct rcu_head *head) 2904{ 2905 struct inode *inode = container_of(head, struct inode, i_rcu); 2906 kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); 2907} 2908 2909static void shmem_destroy_inode(struct inode *inode) 2910{ 2911 if (S_ISREG(inode->i_mode)) 2912 mpol_free_shared_policy(&SHMEM_I(inode)->policy); 2913 call_rcu(&inode->i_rcu, shmem_destroy_callback); 2914} 2915 2916static void shmem_init_inode(void *foo) 2917{ 2918 struct shmem_inode_info *info = foo; 2919 inode_init_once(&info->vfs_inode); 2920} 2921 2922static int shmem_init_inodecache(void) 2923{ 2924 shmem_inode_cachep = kmem_cache_create("shmem_inode_cache", 2925 sizeof(struct shmem_inode_info), 2926 0, SLAB_PANIC, shmem_init_inode); 2927 return 0; 2928} 2929 2930static void shmem_destroy_inodecache(void) 2931{ 2932 kmem_cache_destroy(shmem_inode_cachep); 2933} 2934 2935static const struct address_space_operations shmem_aops = { 2936 .writepage = shmem_writepage, 2937 .set_page_dirty = __set_page_dirty_no_writeback, 2938#ifdef CONFIG_TMPFS 2939 .write_begin = shmem_write_begin, 2940 .write_end = shmem_write_end, 2941#endif 2942 .migratepage = migrate_page, 2943 .error_remove_page = generic_error_remove_page, 2944}; 2945 2946static const struct file_operations shmem_file_operations = { 2947 .mmap = shmem_mmap, 2948#ifdef CONFIG_TMPFS 2949 .llseek = shmem_file_llseek, 2950 .read = new_sync_read, 2951 .write = new_sync_write, 2952 .read_iter = shmem_file_read_iter, 2953 .write_iter = generic_file_write_iter, 2954 .fsync = noop_fsync, 2955 .splice_read = shmem_file_splice_read, 2956 .splice_write = iter_file_splice_write, 2957 .fallocate = shmem_fallocate, 2958#endif 2959}; 2960 2961static const struct inode_operations shmem_inode_operations = { 2962 .setattr = shmem_setattr, 2963#ifdef CONFIG_TMPFS_XATTR 2964 .setxattr = shmem_setxattr, 2965 .getxattr = shmem_getxattr, 2966 .listxattr = shmem_listxattr, 2967 .removexattr = shmem_removexattr, 2968 .set_acl = simple_set_acl, 2969#endif 2970}; 2971 2972static const struct inode_operations shmem_dir_inode_operations = { 2973#ifdef CONFIG_TMPFS 2974 .create = shmem_create, 2975 .lookup = simple_lookup, 2976 .link = shmem_link, 2977 .unlink = shmem_unlink, 2978 .symlink = shmem_symlink, 2979 .mkdir = shmem_mkdir, 2980 .rmdir = shmem_rmdir, 2981 .mknod = shmem_mknod, 2982 .rename = shmem_rename, 2983 .tmpfile = shmem_tmpfile, 2984#endif 2985#ifdef CONFIG_TMPFS_XATTR 2986 .setxattr = shmem_setxattr, 2987 .getxattr = shmem_getxattr, 2988 .listxattr = shmem_listxattr, 2989 .removexattr = shmem_removexattr, 2990#endif 2991#ifdef CONFIG_TMPFS_POSIX_ACL 2992 .setattr = shmem_setattr, 2993 .set_acl = simple_set_acl, 2994#endif 2995}; 2996 2997static const struct inode_operations shmem_special_inode_operations = { 2998#ifdef CONFIG_TMPFS_XATTR 2999 .setxattr = shmem_setxattr, 3000 .getxattr = shmem_getxattr, 3001 .listxattr = shmem_listxattr, 3002 .removexattr = shmem_removexattr, 3003#endif 3004#ifdef CONFIG_TMPFS_POSIX_ACL 3005 .setattr = shmem_setattr, 3006 .set_acl = simple_set_acl, 3007#endif 3008}; 3009 3010static const struct super_operations shmem_ops = { 3011 .alloc_inode = shmem_alloc_inode, 3012 .destroy_inode = shmem_destroy_inode, 3013#ifdef CONFIG_TMPFS 3014 .statfs = shmem_statfs, 3015 .remount_fs = shmem_remount_fs, 3016 .show_options = shmem_show_options, 3017#endif 3018 .evict_inode = shmem_evict_inode, 3019 .drop_inode = generic_delete_inode, 3020 .put_super = shmem_put_super, 3021}; 3022 3023static const struct vm_operations_struct shmem_vm_ops = { 3024 .fault = shmem_fault, 3025 .map_pages = filemap_map_pages, 3026#ifdef CONFIG_NUMA 3027 .set_policy = shmem_set_policy, 3028 .get_policy = shmem_get_policy, 3029#endif 3030 .remap_pages = generic_file_remap_pages, 3031}; 3032 3033static struct dentry *shmem_mount(struct file_system_type *fs_type, 3034 int flags, const char *dev_name, void *data) 3035{ 3036 return mount_nodev(fs_type, flags, data, shmem_fill_super); 3037} 3038 3039static struct file_system_type shmem_fs_type = { 3040 .owner = THIS_MODULE, 3041 .name = "tmpfs", 3042 .mount = shmem_mount, 3043 .kill_sb = kill_litter_super, 3044 .fs_flags = FS_USERNS_MOUNT, 3045}; 3046 3047int __init shmem_init(void) 3048{ 3049 int error; 3050 3051 /* If rootfs called this, don't re-init */ 3052 if (shmem_inode_cachep) 3053 return 0; 3054 3055 error = bdi_init(&shmem_backing_dev_info); 3056 if (error) 3057 goto out4; 3058 3059 error = shmem_init_inodecache(); 3060 if (error) 3061 goto out3; 3062 3063 error = register_filesystem(&shmem_fs_type); 3064 if (error) { 3065 printk(KERN_ERR "Could not register tmpfs\n"); 3066 goto out2; 3067 } 3068 3069 shm_mnt = kern_mount(&shmem_fs_type); 3070 if (IS_ERR(shm_mnt)) { 3071 error = PTR_ERR(shm_mnt); 3072 printk(KERN_ERR "Could not kern_mount tmpfs\n"); 3073 goto out1; 3074 } 3075 return 0; 3076 3077out1: 3078 unregister_filesystem(&shmem_fs_type); 3079out2: 3080 shmem_destroy_inodecache(); 3081out3: 3082 bdi_destroy(&shmem_backing_dev_info); 3083out4: 3084 shm_mnt = ERR_PTR(error); 3085 return error; 3086} 3087 3088#else /* !CONFIG_SHMEM */ 3089 3090/* 3091 * tiny-shmem: simple shmemfs and tmpfs using ramfs code 3092 * 3093 * This is intended for small system where the benefits of the full 3094 * shmem code (swap-backed and resource-limited) are outweighed by 3095 * their complexity. On systems without swap this code should be 3096 * effectively equivalent, but much lighter weight. 3097 */ 3098 3099static struct file_system_type shmem_fs_type = { 3100 .name = "tmpfs", 3101 .mount = ramfs_mount, 3102 .kill_sb = kill_litter_super, 3103 .fs_flags = FS_USERNS_MOUNT, 3104}; 3105 3106int __init shmem_init(void) 3107{ 3108 BUG_ON(register_filesystem(&shmem_fs_type) != 0); 3109 3110 shm_mnt = kern_mount(&shmem_fs_type); 3111 BUG_ON(IS_ERR(shm_mnt)); 3112 3113 return 0; 3114} 3115 3116int shmem_unuse(swp_entry_t swap, struct page *page) 3117{ 3118 return 0; 3119} 3120 3121int shmem_lock(struct file *file, int lock, struct user_struct *user) 3122{ 3123 return 0; 3124} 3125 3126void shmem_unlock_mapping(struct address_space *mapping) 3127{ 3128} 3129 3130void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) 3131{ 3132 truncate_inode_pages_range(inode->i_mapping, lstart, lend); 3133} 3134EXPORT_SYMBOL_GPL(shmem_truncate_range); 3135 3136#define shmem_vm_ops generic_file_vm_ops 3137#define shmem_file_operations ramfs_file_operations 3138#define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev) 3139#define shmem_acct_size(flags, size) 0 3140#define shmem_unacct_size(flags, size) do {} while (0) 3141 3142#endif /* CONFIG_SHMEM */ 3143 3144/* common code */ 3145 3146static struct dentry_operations anon_ops = { 3147 .d_dname = simple_dname 3148}; 3149 3150static struct file *__shmem_file_setup(const char *name, loff_t size, 3151 unsigned long flags, unsigned int i_flags) 3152{ 3153 struct file *res; 3154 struct inode *inode; 3155 struct path path; 3156 struct super_block *sb; 3157 struct qstr this; 3158 3159 if (IS_ERR(shm_mnt)) 3160 return ERR_CAST(shm_mnt); 3161 3162 if (size < 0 || size > MAX_LFS_FILESIZE) 3163 return ERR_PTR(-EINVAL); 3164 3165 if (shmem_acct_size(flags, size)) 3166 return ERR_PTR(-ENOMEM); 3167 3168 res = ERR_PTR(-ENOMEM); 3169 this.name = name; 3170 this.len = strlen(name); 3171 this.hash = 0; /* will go */ 3172 sb = shm_mnt->mnt_sb; 3173 path.mnt = mntget(shm_mnt); 3174 path.dentry = d_alloc_pseudo(sb, &this); 3175 if (!path.dentry) 3176 goto put_memory; 3177 d_set_d_op(path.dentry, &anon_ops); 3178 3179 res = ERR_PTR(-ENOSPC); 3180 inode = shmem_get_inode(sb, NULL, S_IFREG | S_IRWXUGO, 0, flags); 3181 if (!inode) 3182 goto put_memory; 3183 3184 inode->i_flags |= i_flags; 3185 d_instantiate(path.dentry, inode); 3186 inode->i_size = size; 3187 clear_nlink(inode); /* It is unlinked */ 3188 res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size)); 3189 if (IS_ERR(res)) 3190 goto put_path; 3191 3192 res = alloc_file(&path, FMODE_WRITE | FMODE_READ, 3193 &shmem_file_operations); 3194 if (IS_ERR(res)) 3195 goto put_path; 3196 3197 return res; 3198 3199put_memory: 3200 shmem_unacct_size(flags, size); 3201put_path: 3202 path_put(&path); 3203 return res; 3204} 3205 3206/** 3207 * shmem_kernel_file_setup - get an unlinked file living in tmpfs which must be 3208 * kernel internal. There will be NO LSM permission checks against the 3209 * underlying inode. So users of this interface must do LSM checks at a 3210 * higher layer. The one user is the big_key implementation. LSM checks 3211 * are provided at the key level rather than the inode level. 3212 * @name: name for dentry (to be seen in /proc/<pid>/maps 3213 * @size: size to be set for the file 3214 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size 3215 */ 3216struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags) 3217{ 3218 return __shmem_file_setup(name, size, flags, S_PRIVATE); 3219} 3220 3221/** 3222 * shmem_file_setup - get an unlinked file living in tmpfs 3223 * @name: name for dentry (to be seen in /proc/<pid>/maps 3224 * @size: size to be set for the file 3225 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size 3226 */ 3227struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags) 3228{ 3229 return __shmem_file_setup(name, size, flags, 0); 3230} 3231EXPORT_SYMBOL_GPL(shmem_file_setup); 3232 3233/** 3234 * shmem_zero_setup - setup a shared anonymous mapping 3235 * @vma: the vma to be mmapped is prepared by do_mmap_pgoff 3236 */ 3237int shmem_zero_setup(struct vm_area_struct *vma) 3238{ 3239 struct file *file; 3240 loff_t size = vma->vm_end - vma->vm_start; 3241 3242 file = shmem_file_setup("dev/zero", size, vma->vm_flags); 3243 if (IS_ERR(file)) 3244 return PTR_ERR(file); 3245 3246 if (vma->vm_file) 3247 fput(vma->vm_file); 3248 vma->vm_file = file; 3249 vma->vm_ops = &shmem_vm_ops; 3250 return 0; 3251} 3252 3253/** 3254 * shmem_read_mapping_page_gfp - read into page cache, using specified page allocation flags. 3255 * @mapping: the page's address_space 3256 * @index: the page index 3257 * @gfp: the page allocator flags to use if allocating 3258 * 3259 * This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)", 3260 * with any new page allocations done using the specified allocation flags. 3261 * But read_cache_page_gfp() uses the ->readpage() method: which does not 3262 * suit tmpfs, since it may have pages in swapcache, and needs to find those 3263 * for itself; although drivers/gpu/drm i915 and ttm rely upon this support. 3264 * 3265 * i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in 3266 * with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily. 3267 */ 3268struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, 3269 pgoff_t index, gfp_t gfp) 3270{ 3271#ifdef CONFIG_SHMEM 3272 struct inode *inode = mapping->host; 3273 struct page *page; 3274 int error; 3275 3276 BUG_ON(mapping->a_ops != &shmem_aops); 3277 error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE, gfp, NULL); 3278 if (error) 3279 page = ERR_PTR(error); 3280 else 3281 unlock_page(page); 3282 return page; 3283#else 3284 /* 3285 * The tiny !SHMEM case uses ramfs without swap 3286 */ 3287 return read_cache_page_gfp(mapping, index, gfp); 3288#endif 3289} 3290EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp); 3291