shmem.c revision 285b2c4fdd69ea73b4762785d8c6be83b6c074a6
1/* 2 * Resizable virtual memory filesystem for Linux. 3 * 4 * Copyright (C) 2000 Linus Torvalds. 5 * 2000 Transmeta Corp. 6 * 2000-2001 Christoph Rohland 7 * 2000-2001 SAP AG 8 * 2002 Red Hat Inc. 9 * Copyright (C) 2002-2005 Hugh Dickins. 10 * Copyright (C) 2002-2005 VERITAS Software Corporation. 11 * Copyright (C) 2004 Andi Kleen, SuSE Labs 12 * 13 * Extended attribute support for tmpfs: 14 * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net> 15 * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com> 16 * 17 * tiny-shmem: 18 * Copyright (c) 2004, 2008 Matt Mackall <mpm@selenic.com> 19 * 20 * This file is released under the GPL. 21 */ 22 23#include <linux/fs.h> 24#include <linux/init.h> 25#include <linux/vfs.h> 26#include <linux/mount.h> 27#include <linux/pagemap.h> 28#include <linux/file.h> 29#include <linux/mm.h> 30#include <linux/module.h> 31#include <linux/percpu_counter.h> 32#include <linux/swap.h> 33 34static struct vfsmount *shm_mnt; 35 36#ifdef CONFIG_SHMEM 37/* 38 * This virtual memory filesystem is heavily based on the ramfs. It 39 * extends ramfs by the ability to use swap and honor resource limits 40 * which makes it a completely usable filesystem. 41 */ 42 43#include <linux/xattr.h> 44#include <linux/exportfs.h> 45#include <linux/posix_acl.h> 46#include <linux/generic_acl.h> 47#include <linux/mman.h> 48#include <linux/string.h> 49#include <linux/slab.h> 50#include <linux/backing-dev.h> 51#include <linux/shmem_fs.h> 52#include <linux/writeback.h> 53#include <linux/blkdev.h> 54#include <linux/splice.h> 55#include <linux/security.h> 56#include <linux/swapops.h> 57#include <linux/mempolicy.h> 58#include <linux/namei.h> 59#include <linux/ctype.h> 60#include <linux/migrate.h> 61#include <linux/highmem.h> 62#include <linux/seq_file.h> 63#include <linux/magic.h> 64 65#include <asm/uaccess.h> 66#include <asm/div64.h> 67#include <asm/pgtable.h> 68 69#define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512) 70#define VM_ACCT(size) (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT) 71 72/* Pretend that each entry is of this size in directory's i_size */ 73#define BOGO_DIRENT_SIZE 20 74 75struct shmem_xattr { 76 struct list_head list; /* anchored by shmem_inode_info->xattr_list */ 77 char *name; /* xattr name */ 78 size_t size; 79 char value[0]; 80}; 81 82/* Flag allocation requirements to shmem_getpage */ 83enum sgp_type { 84 SGP_READ, /* don't exceed i_size, don't allocate page */ 85 SGP_CACHE, /* don't exceed i_size, may allocate page */ 86 SGP_DIRTY, /* like SGP_CACHE, but set new page dirty */ 87 SGP_WRITE, /* may exceed i_size, may allocate page */ 88}; 89 90#ifdef CONFIG_TMPFS 91static unsigned long shmem_default_max_blocks(void) 92{ 93 return totalram_pages / 2; 94} 95 96static unsigned long shmem_default_max_inodes(void) 97{ 98 return min(totalram_pages - totalhigh_pages, totalram_pages / 2); 99} 100#endif 101 102static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, 103 struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type); 104 105static inline int shmem_getpage(struct inode *inode, pgoff_t index, 106 struct page **pagep, enum sgp_type sgp, int *fault_type) 107{ 108 return shmem_getpage_gfp(inode, index, pagep, sgp, 109 mapping_gfp_mask(inode->i_mapping), fault_type); 110} 111 112static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb) 113{ 114 return sb->s_fs_info; 115} 116 117/* 118 * shmem_file_setup pre-accounts the whole fixed size of a VM object, 119 * for shared memory and for shared anonymous (/dev/zero) mappings 120 * (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1), 121 * consistent with the pre-accounting of private mappings ... 122 */ 123static inline int shmem_acct_size(unsigned long flags, loff_t size) 124{ 125 return (flags & VM_NORESERVE) ? 126 0 : security_vm_enough_memory_kern(VM_ACCT(size)); 127} 128 129static inline void shmem_unacct_size(unsigned long flags, loff_t size) 130{ 131 if (!(flags & VM_NORESERVE)) 132 vm_unacct_memory(VM_ACCT(size)); 133} 134 135/* 136 * ... whereas tmpfs objects are accounted incrementally as 137 * pages are allocated, in order to allow huge sparse files. 138 * shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM, 139 * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM. 140 */ 141static inline int shmem_acct_block(unsigned long flags) 142{ 143 return (flags & VM_NORESERVE) ? 144 security_vm_enough_memory_kern(VM_ACCT(PAGE_CACHE_SIZE)) : 0; 145} 146 147static inline void shmem_unacct_blocks(unsigned long flags, long pages) 148{ 149 if (flags & VM_NORESERVE) 150 vm_unacct_memory(pages * VM_ACCT(PAGE_CACHE_SIZE)); 151} 152 153static const struct super_operations shmem_ops; 154static const struct address_space_operations shmem_aops; 155static const struct file_operations shmem_file_operations; 156static const struct inode_operations shmem_inode_operations; 157static const struct inode_operations shmem_dir_inode_operations; 158static const struct inode_operations shmem_special_inode_operations; 159static const struct vm_operations_struct shmem_vm_ops; 160 161static struct backing_dev_info shmem_backing_dev_info __read_mostly = { 162 .ra_pages = 0, /* No readahead */ 163 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, 164}; 165 166static LIST_HEAD(shmem_swaplist); 167static DEFINE_MUTEX(shmem_swaplist_mutex); 168 169static void shmem_free_blocks(struct inode *inode, long pages) 170{ 171 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); 172 if (sbinfo->max_blocks) { 173 percpu_counter_add(&sbinfo->used_blocks, -pages); 174 inode->i_blocks -= pages*BLOCKS_PER_PAGE; 175 } 176} 177 178static int shmem_reserve_inode(struct super_block *sb) 179{ 180 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 181 if (sbinfo->max_inodes) { 182 spin_lock(&sbinfo->stat_lock); 183 if (!sbinfo->free_inodes) { 184 spin_unlock(&sbinfo->stat_lock); 185 return -ENOSPC; 186 } 187 sbinfo->free_inodes--; 188 spin_unlock(&sbinfo->stat_lock); 189 } 190 return 0; 191} 192 193static void shmem_free_inode(struct super_block *sb) 194{ 195 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 196 if (sbinfo->max_inodes) { 197 spin_lock(&sbinfo->stat_lock); 198 sbinfo->free_inodes++; 199 spin_unlock(&sbinfo->stat_lock); 200 } 201} 202 203/** 204 * shmem_recalc_inode - recalculate the size of an inode 205 * @inode: inode to recalc 206 * 207 * We have to calculate the free blocks since the mm can drop 208 * undirtied hole pages behind our back. 209 * 210 * But normally info->alloced == inode->i_mapping->nrpages + info->swapped 211 * So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped) 212 * 213 * It has to be called with the spinlock held. 214 */ 215static void shmem_recalc_inode(struct inode *inode) 216{ 217 struct shmem_inode_info *info = SHMEM_I(inode); 218 long freed; 219 220 freed = info->alloced - info->swapped - inode->i_mapping->nrpages; 221 if (freed > 0) { 222 info->alloced -= freed; 223 shmem_unacct_blocks(info->flags, freed); 224 shmem_free_blocks(inode, freed); 225 } 226} 227 228static void shmem_put_swap(struct shmem_inode_info *info, pgoff_t index, 229 swp_entry_t swap) 230{ 231 if (index < SHMEM_NR_DIRECT) 232 info->i_direct[index] = swap; 233} 234 235static swp_entry_t shmem_get_swap(struct shmem_inode_info *info, pgoff_t index) 236{ 237 return (index < SHMEM_NR_DIRECT) ? 238 info->i_direct[index] : (swp_entry_t){0}; 239} 240 241void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) 242{ 243 struct address_space *mapping = inode->i_mapping; 244 struct shmem_inode_info *info = SHMEM_I(inode); 245 pgoff_t start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 246 pgoff_t end = (lend >> PAGE_CACHE_SHIFT); 247 pgoff_t index; 248 swp_entry_t swap; 249 250 truncate_inode_pages_range(mapping, lstart, lend); 251 252 if (end > SHMEM_NR_DIRECT) 253 end = SHMEM_NR_DIRECT; 254 255 spin_lock(&info->lock); 256 for (index = start; index < end; index++) { 257 swap = shmem_get_swap(info, index); 258 if (swap.val) { 259 free_swap_and_cache(swap); 260 shmem_put_swap(info, index, (swp_entry_t){0}); 261 info->swapped--; 262 } 263 } 264 265 if (mapping->nrpages) { 266 spin_unlock(&info->lock); 267 /* 268 * A page may have meanwhile sneaked in from swap. 269 */ 270 truncate_inode_pages_range(mapping, lstart, lend); 271 spin_lock(&info->lock); 272 } 273 274 shmem_recalc_inode(inode); 275 spin_unlock(&info->lock); 276 277 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 278} 279EXPORT_SYMBOL_GPL(shmem_truncate_range); 280 281static int shmem_setattr(struct dentry *dentry, struct iattr *attr) 282{ 283 struct inode *inode = dentry->d_inode; 284 int error; 285 286 error = inode_change_ok(inode, attr); 287 if (error) 288 return error; 289 290 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { 291 loff_t oldsize = inode->i_size; 292 loff_t newsize = attr->ia_size; 293 struct page *page = NULL; 294 295 if (newsize < oldsize) { 296 /* 297 * If truncating down to a partial page, then 298 * if that page is already allocated, hold it 299 * in memory until the truncation is over, so 300 * truncate_partial_page cannot miss it were 301 * it assigned to swap. 302 */ 303 if (newsize & (PAGE_CACHE_SIZE-1)) { 304 (void) shmem_getpage(inode, 305 newsize >> PAGE_CACHE_SHIFT, 306 &page, SGP_READ, NULL); 307 if (page) 308 unlock_page(page); 309 } 310 } 311 if (newsize != oldsize) { 312 i_size_write(inode, newsize); 313 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 314 } 315 if (newsize < oldsize) { 316 loff_t holebegin = round_up(newsize, PAGE_SIZE); 317 unmap_mapping_range(inode->i_mapping, holebegin, 0, 1); 318 shmem_truncate_range(inode, newsize, (loff_t)-1); 319 /* unmap again to remove racily COWed private pages */ 320 unmap_mapping_range(inode->i_mapping, holebegin, 0, 1); 321 } 322 if (page) 323 page_cache_release(page); 324 } 325 326 setattr_copy(inode, attr); 327#ifdef CONFIG_TMPFS_POSIX_ACL 328 if (attr->ia_valid & ATTR_MODE) 329 error = generic_acl_chmod(inode); 330#endif 331 return error; 332} 333 334static void shmem_evict_inode(struct inode *inode) 335{ 336 struct shmem_inode_info *info = SHMEM_I(inode); 337 struct shmem_xattr *xattr, *nxattr; 338 339 if (inode->i_mapping->a_ops == &shmem_aops) { 340 shmem_unacct_size(info->flags, inode->i_size); 341 inode->i_size = 0; 342 shmem_truncate_range(inode, 0, (loff_t)-1); 343 if (!list_empty(&info->swaplist)) { 344 mutex_lock(&shmem_swaplist_mutex); 345 list_del_init(&info->swaplist); 346 mutex_unlock(&shmem_swaplist_mutex); 347 } 348 } 349 350 list_for_each_entry_safe(xattr, nxattr, &info->xattr_list, list) { 351 kfree(xattr->name); 352 kfree(xattr); 353 } 354 BUG_ON(inode->i_blocks); 355 shmem_free_inode(inode->i_sb); 356 end_writeback(inode); 357} 358 359static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, struct page *page) 360{ 361 struct address_space *mapping = info->vfs_inode.i_mapping; 362 unsigned long idx; 363 int error; 364 365 for (idx = 0; idx < SHMEM_NR_DIRECT; idx++) 366 if (shmem_get_swap(info, idx).val == entry.val) 367 goto found; 368 return 0; 369found: 370 spin_lock(&info->lock); 371 if (shmem_get_swap(info, idx).val != entry.val) { 372 spin_unlock(&info->lock); 373 return 0; 374 } 375 376 /* 377 * Move _head_ to start search for next from here. 378 * But be careful: shmem_evict_inode checks list_empty without taking 379 * mutex, and there's an instant in list_move_tail when info->swaplist 380 * would appear empty, if it were the only one on shmem_swaplist. 381 */ 382 if (shmem_swaplist.next != &info->swaplist) 383 list_move_tail(&shmem_swaplist, &info->swaplist); 384 385 /* 386 * We rely on shmem_swaplist_mutex, not only to protect the swaplist, 387 * but also to hold up shmem_evict_inode(): so inode cannot be freed 388 * beneath us (pagelock doesn't help until the page is in pagecache). 389 */ 390 error = add_to_page_cache_locked(page, mapping, idx, GFP_NOWAIT); 391 /* which does mem_cgroup_uncharge_cache_page on error */ 392 393 if (error != -ENOMEM) { 394 delete_from_swap_cache(page); 395 set_page_dirty(page); 396 shmem_put_swap(info, idx, (swp_entry_t){0}); 397 info->swapped--; 398 swap_free(entry); 399 error = 1; /* not an error, but entry was found */ 400 } 401 spin_unlock(&info->lock); 402 return error; 403} 404 405/* 406 * shmem_unuse() search for an eventually swapped out shmem page. 407 */ 408int shmem_unuse(swp_entry_t entry, struct page *page) 409{ 410 struct list_head *p, *next; 411 struct shmem_inode_info *info; 412 int found = 0; 413 int error; 414 415 /* 416 * Charge page using GFP_KERNEL while we can wait, before taking 417 * the shmem_swaplist_mutex which might hold up shmem_writepage(). 418 * Charged back to the user (not to caller) when swap account is used. 419 * add_to_page_cache() will be called with GFP_NOWAIT. 420 */ 421 error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL); 422 if (error) 423 goto out; 424 /* 425 * Try to preload while we can wait, to not make a habit of 426 * draining atomic reserves; but don't latch on to this cpu, 427 * it's okay if sometimes we get rescheduled after this. 428 */ 429 error = radix_tree_preload(GFP_KERNEL); 430 if (error) 431 goto uncharge; 432 radix_tree_preload_end(); 433 434 mutex_lock(&shmem_swaplist_mutex); 435 list_for_each_safe(p, next, &shmem_swaplist) { 436 info = list_entry(p, struct shmem_inode_info, swaplist); 437 if (!info->swapped) { 438 spin_lock(&info->lock); 439 if (!info->swapped) 440 list_del_init(&info->swaplist); 441 spin_unlock(&info->lock); 442 } 443 if (info->swapped) 444 found = shmem_unuse_inode(info, entry, page); 445 cond_resched(); 446 if (found) 447 break; 448 } 449 mutex_unlock(&shmem_swaplist_mutex); 450 451uncharge: 452 if (!found) 453 mem_cgroup_uncharge_cache_page(page); 454 if (found < 0) 455 error = found; 456out: 457 unlock_page(page); 458 page_cache_release(page); 459 return error; 460} 461 462/* 463 * Move the page from the page cache to the swap cache. 464 */ 465static int shmem_writepage(struct page *page, struct writeback_control *wbc) 466{ 467 struct shmem_inode_info *info; 468 swp_entry_t swap, oswap; 469 struct address_space *mapping; 470 unsigned long index; 471 struct inode *inode; 472 473 BUG_ON(!PageLocked(page)); 474 mapping = page->mapping; 475 index = page->index; 476 inode = mapping->host; 477 info = SHMEM_I(inode); 478 if (info->flags & VM_LOCKED) 479 goto redirty; 480 if (!total_swap_pages) 481 goto redirty; 482 483 /* 484 * shmem_backing_dev_info's capabilities prevent regular writeback or 485 * sync from ever calling shmem_writepage; but a stacking filesystem 486 * might use ->writepage of its underlying filesystem, in which case 487 * tmpfs should write out to swap only in response to memory pressure, 488 * and not for the writeback threads or sync. 489 */ 490 if (!wbc->for_reclaim) { 491 WARN_ON_ONCE(1); /* Still happens? Tell us about it! */ 492 goto redirty; 493 } 494 495 /* 496 * Just for this patch, we have a toy implementation, 497 * which can swap out only the first SHMEM_NR_DIRECT pages: 498 * for simple demonstration of where we need to think about swap. 499 */ 500 if (index >= SHMEM_NR_DIRECT) 501 goto redirty; 502 503 swap = get_swap_page(); 504 if (!swap.val) 505 goto redirty; 506 507 /* 508 * Add inode to shmem_unuse()'s list of swapped-out inodes, 509 * if it's not already there. Do it now because we cannot take 510 * mutex while holding spinlock, and must do so before the page 511 * is moved to swap cache, when its pagelock no longer protects 512 * the inode from eviction. But don't unlock the mutex until 513 * we've taken the spinlock, because shmem_unuse_inode() will 514 * prune a !swapped inode from the swaplist under both locks. 515 */ 516 mutex_lock(&shmem_swaplist_mutex); 517 if (list_empty(&info->swaplist)) 518 list_add_tail(&info->swaplist, &shmem_swaplist); 519 520 spin_lock(&info->lock); 521 mutex_unlock(&shmem_swaplist_mutex); 522 523 oswap = shmem_get_swap(info, index); 524 if (oswap.val) { 525 WARN_ON_ONCE(1); /* Still happens? Tell us about it! */ 526 free_swap_and_cache(oswap); 527 shmem_put_swap(info, index, (swp_entry_t){0}); 528 info->swapped--; 529 } 530 shmem_recalc_inode(inode); 531 532 if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) { 533 delete_from_page_cache(page); 534 shmem_put_swap(info, index, swap); 535 info->swapped++; 536 swap_shmem_alloc(swap); 537 spin_unlock(&info->lock); 538 BUG_ON(page_mapped(page)); 539 swap_writepage(page, wbc); 540 return 0; 541 } 542 543 spin_unlock(&info->lock); 544 swapcache_free(swap, NULL); 545redirty: 546 set_page_dirty(page); 547 if (wbc->for_reclaim) 548 return AOP_WRITEPAGE_ACTIVATE; /* Return with page locked */ 549 unlock_page(page); 550 return 0; 551} 552 553#ifdef CONFIG_NUMA 554#ifdef CONFIG_TMPFS 555static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) 556{ 557 char buffer[64]; 558 559 if (!mpol || mpol->mode == MPOL_DEFAULT) 560 return; /* show nothing */ 561 562 mpol_to_str(buffer, sizeof(buffer), mpol, 1); 563 564 seq_printf(seq, ",mpol=%s", buffer); 565} 566 567static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) 568{ 569 struct mempolicy *mpol = NULL; 570 if (sbinfo->mpol) { 571 spin_lock(&sbinfo->stat_lock); /* prevent replace/use races */ 572 mpol = sbinfo->mpol; 573 mpol_get(mpol); 574 spin_unlock(&sbinfo->stat_lock); 575 } 576 return mpol; 577} 578#endif /* CONFIG_TMPFS */ 579 580static struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp, 581 struct shmem_inode_info *info, unsigned long idx) 582{ 583 struct mempolicy mpol, *spol; 584 struct vm_area_struct pvma; 585 struct page *page; 586 587 spol = mpol_cond_copy(&mpol, 588 mpol_shared_policy_lookup(&info->policy, idx)); 589 590 /* Create a pseudo vma that just contains the policy */ 591 pvma.vm_start = 0; 592 pvma.vm_pgoff = idx; 593 pvma.vm_ops = NULL; 594 pvma.vm_policy = spol; 595 page = swapin_readahead(entry, gfp, &pvma, 0); 596 return page; 597} 598 599static struct page *shmem_alloc_page(gfp_t gfp, 600 struct shmem_inode_info *info, unsigned long idx) 601{ 602 struct vm_area_struct pvma; 603 604 /* Create a pseudo vma that just contains the policy */ 605 pvma.vm_start = 0; 606 pvma.vm_pgoff = idx; 607 pvma.vm_ops = NULL; 608 pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx); 609 610 /* 611 * alloc_page_vma() will drop the shared policy reference 612 */ 613 return alloc_page_vma(gfp, &pvma, 0); 614} 615#else /* !CONFIG_NUMA */ 616#ifdef CONFIG_TMPFS 617static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *p) 618{ 619} 620#endif /* CONFIG_TMPFS */ 621 622static inline struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp, 623 struct shmem_inode_info *info, unsigned long idx) 624{ 625 return swapin_readahead(entry, gfp, NULL, 0); 626} 627 628static inline struct page *shmem_alloc_page(gfp_t gfp, 629 struct shmem_inode_info *info, unsigned long idx) 630{ 631 return alloc_page(gfp); 632} 633#endif /* CONFIG_NUMA */ 634 635#if !defined(CONFIG_NUMA) || !defined(CONFIG_TMPFS) 636static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) 637{ 638 return NULL; 639} 640#endif 641 642/* 643 * shmem_getpage_gfp - find page in cache, or get from swap, or allocate 644 * 645 * If we allocate a new one we do not mark it dirty. That's up to the 646 * vm. If we swap it in we mark it dirty since we also free the swap 647 * entry since a page cannot live in both the swap and page cache 648 */ 649static int shmem_getpage_gfp(struct inode *inode, pgoff_t idx, 650 struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type) 651{ 652 struct address_space *mapping = inode->i_mapping; 653 struct shmem_inode_info *info = SHMEM_I(inode); 654 struct shmem_sb_info *sbinfo; 655 struct page *page; 656 struct page *prealloc_page = NULL; 657 swp_entry_t swap; 658 int error; 659 660 if (idx > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT)) 661 return -EFBIG; 662repeat: 663 page = find_lock_page(mapping, idx); 664 if (page) { 665 /* 666 * Once we can get the page lock, it must be uptodate: 667 * if there were an error in reading back from swap, 668 * the page would not be inserted into the filecache. 669 */ 670 BUG_ON(!PageUptodate(page)); 671 goto done; 672 } 673 674 /* 675 * Try to preload while we can wait, to not make a habit of 676 * draining atomic reserves; but don't latch on to this cpu. 677 */ 678 error = radix_tree_preload(gfp & GFP_RECLAIM_MASK); 679 if (error) 680 goto out; 681 radix_tree_preload_end(); 682 683 if (sgp != SGP_READ && !prealloc_page) { 684 prealloc_page = shmem_alloc_page(gfp, info, idx); 685 if (prealloc_page) { 686 SetPageSwapBacked(prealloc_page); 687 if (mem_cgroup_cache_charge(prealloc_page, 688 current->mm, GFP_KERNEL)) { 689 page_cache_release(prealloc_page); 690 prealloc_page = NULL; 691 } 692 } 693 } 694 695 spin_lock(&info->lock); 696 shmem_recalc_inode(inode); 697 swap = shmem_get_swap(info, idx); 698 if (swap.val) { 699 /* Look it up and read it in.. */ 700 page = lookup_swap_cache(swap); 701 if (!page) { 702 spin_unlock(&info->lock); 703 /* here we actually do the io */ 704 if (fault_type) 705 *fault_type |= VM_FAULT_MAJOR; 706 page = shmem_swapin(swap, gfp, info, idx); 707 if (!page) { 708 swp_entry_t nswap = shmem_get_swap(info, idx); 709 if (nswap.val == swap.val) { 710 error = -ENOMEM; 711 goto out; 712 } 713 goto repeat; 714 } 715 wait_on_page_locked(page); 716 page_cache_release(page); 717 goto repeat; 718 } 719 720 /* We have to do this with page locked to prevent races */ 721 if (!trylock_page(page)) { 722 spin_unlock(&info->lock); 723 wait_on_page_locked(page); 724 page_cache_release(page); 725 goto repeat; 726 } 727 if (PageWriteback(page)) { 728 spin_unlock(&info->lock); 729 wait_on_page_writeback(page); 730 unlock_page(page); 731 page_cache_release(page); 732 goto repeat; 733 } 734 if (!PageUptodate(page)) { 735 spin_unlock(&info->lock); 736 unlock_page(page); 737 page_cache_release(page); 738 error = -EIO; 739 goto out; 740 } 741 742 error = add_to_page_cache_locked(page, mapping, 743 idx, GFP_NOWAIT); 744 if (error) { 745 spin_unlock(&info->lock); 746 if (error == -ENOMEM) { 747 /* 748 * reclaim from proper memory cgroup and 749 * call memcg's OOM if needed. 750 */ 751 error = mem_cgroup_shmem_charge_fallback( 752 page, current->mm, gfp); 753 if (error) { 754 unlock_page(page); 755 page_cache_release(page); 756 goto out; 757 } 758 } 759 unlock_page(page); 760 page_cache_release(page); 761 goto repeat; 762 } 763 764 delete_from_swap_cache(page); 765 shmem_put_swap(info, idx, (swp_entry_t){0}); 766 info->swapped--; 767 spin_unlock(&info->lock); 768 set_page_dirty(page); 769 swap_free(swap); 770 771 } else if (sgp == SGP_READ) { 772 page = find_get_page(mapping, idx); 773 if (page && !trylock_page(page)) { 774 spin_unlock(&info->lock); 775 wait_on_page_locked(page); 776 page_cache_release(page); 777 goto repeat; 778 } 779 spin_unlock(&info->lock); 780 781 } else if (prealloc_page) { 782 sbinfo = SHMEM_SB(inode->i_sb); 783 if (sbinfo->max_blocks) { 784 if (percpu_counter_compare(&sbinfo->used_blocks, 785 sbinfo->max_blocks) >= 0 || 786 shmem_acct_block(info->flags)) 787 goto nospace; 788 percpu_counter_inc(&sbinfo->used_blocks); 789 inode->i_blocks += BLOCKS_PER_PAGE; 790 } else if (shmem_acct_block(info->flags)) 791 goto nospace; 792 793 page = prealloc_page; 794 prealloc_page = NULL; 795 796 swap = shmem_get_swap(info, idx); 797 if (swap.val) 798 mem_cgroup_uncharge_cache_page(page); 799 else 800 error = add_to_page_cache_lru(page, mapping, 801 idx, GFP_NOWAIT); 802 /* 803 * At add_to_page_cache_lru() failure, 804 * uncharge will be done automatically. 805 */ 806 if (swap.val || error) { 807 shmem_unacct_blocks(info->flags, 1); 808 shmem_free_blocks(inode, 1); 809 spin_unlock(&info->lock); 810 page_cache_release(page); 811 goto repeat; 812 } 813 814 info->alloced++; 815 spin_unlock(&info->lock); 816 clear_highpage(page); 817 flush_dcache_page(page); 818 SetPageUptodate(page); 819 if (sgp == SGP_DIRTY) 820 set_page_dirty(page); 821 822 } else { 823 spin_unlock(&info->lock); 824 error = -ENOMEM; 825 goto out; 826 } 827done: 828 *pagep = page; 829 error = 0; 830out: 831 if (prealloc_page) { 832 mem_cgroup_uncharge_cache_page(prealloc_page); 833 page_cache_release(prealloc_page); 834 } 835 return error; 836 837nospace: 838 /* 839 * Perhaps the page was brought in from swap between find_lock_page 840 * and taking info->lock? We allow for that at add_to_page_cache_lru, 841 * but must also avoid reporting a spurious ENOSPC while working on a 842 * full tmpfs. 843 */ 844 page = find_get_page(mapping, idx); 845 spin_unlock(&info->lock); 846 if (page) { 847 page_cache_release(page); 848 goto repeat; 849 } 850 error = -ENOSPC; 851 goto out; 852} 853 854static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 855{ 856 struct inode *inode = vma->vm_file->f_path.dentry->d_inode; 857 int error; 858 int ret = VM_FAULT_LOCKED; 859 860 if (((loff_t)vmf->pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode)) 861 return VM_FAULT_SIGBUS; 862 863 error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret); 864 if (error) 865 return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); 866 867 if (ret & VM_FAULT_MAJOR) { 868 count_vm_event(PGMAJFAULT); 869 mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); 870 } 871 return ret; 872} 873 874#ifdef CONFIG_NUMA 875static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new) 876{ 877 struct inode *i = vma->vm_file->f_path.dentry->d_inode; 878 return mpol_set_shared_policy(&SHMEM_I(i)->policy, vma, new); 879} 880 881static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, 882 unsigned long addr) 883{ 884 struct inode *i = vma->vm_file->f_path.dentry->d_inode; 885 unsigned long idx; 886 887 idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 888 return mpol_shared_policy_lookup(&SHMEM_I(i)->policy, idx); 889} 890#endif 891 892int shmem_lock(struct file *file, int lock, struct user_struct *user) 893{ 894 struct inode *inode = file->f_path.dentry->d_inode; 895 struct shmem_inode_info *info = SHMEM_I(inode); 896 int retval = -ENOMEM; 897 898 spin_lock(&info->lock); 899 if (lock && !(info->flags & VM_LOCKED)) { 900 if (!user_shm_lock(inode->i_size, user)) 901 goto out_nomem; 902 info->flags |= VM_LOCKED; 903 mapping_set_unevictable(file->f_mapping); 904 } 905 if (!lock && (info->flags & VM_LOCKED) && user) { 906 user_shm_unlock(inode->i_size, user); 907 info->flags &= ~VM_LOCKED; 908 mapping_clear_unevictable(file->f_mapping); 909 scan_mapping_unevictable_pages(file->f_mapping); 910 } 911 retval = 0; 912 913out_nomem: 914 spin_unlock(&info->lock); 915 return retval; 916} 917 918static int shmem_mmap(struct file *file, struct vm_area_struct *vma) 919{ 920 file_accessed(file); 921 vma->vm_ops = &shmem_vm_ops; 922 vma->vm_flags |= VM_CAN_NONLINEAR; 923 return 0; 924} 925 926static struct inode *shmem_get_inode(struct super_block *sb, const struct inode *dir, 927 int mode, dev_t dev, unsigned long flags) 928{ 929 struct inode *inode; 930 struct shmem_inode_info *info; 931 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 932 933 if (shmem_reserve_inode(sb)) 934 return NULL; 935 936 inode = new_inode(sb); 937 if (inode) { 938 inode->i_ino = get_next_ino(); 939 inode_init_owner(inode, dir, mode); 940 inode->i_blocks = 0; 941 inode->i_mapping->backing_dev_info = &shmem_backing_dev_info; 942 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 943 inode->i_generation = get_seconds(); 944 info = SHMEM_I(inode); 945 memset(info, 0, (char *)inode - (char *)info); 946 spin_lock_init(&info->lock); 947 info->flags = flags & VM_NORESERVE; 948 INIT_LIST_HEAD(&info->swaplist); 949 INIT_LIST_HEAD(&info->xattr_list); 950 cache_no_acl(inode); 951 952 switch (mode & S_IFMT) { 953 default: 954 inode->i_op = &shmem_special_inode_operations; 955 init_special_inode(inode, mode, dev); 956 break; 957 case S_IFREG: 958 inode->i_mapping->a_ops = &shmem_aops; 959 inode->i_op = &shmem_inode_operations; 960 inode->i_fop = &shmem_file_operations; 961 mpol_shared_policy_init(&info->policy, 962 shmem_get_sbmpol(sbinfo)); 963 break; 964 case S_IFDIR: 965 inc_nlink(inode); 966 /* Some things misbehave if size == 0 on a directory */ 967 inode->i_size = 2 * BOGO_DIRENT_SIZE; 968 inode->i_op = &shmem_dir_inode_operations; 969 inode->i_fop = &simple_dir_operations; 970 break; 971 case S_IFLNK: 972 /* 973 * Must not load anything in the rbtree, 974 * mpol_free_shared_policy will not be called. 975 */ 976 mpol_shared_policy_init(&info->policy, NULL); 977 break; 978 } 979 } else 980 shmem_free_inode(sb); 981 return inode; 982} 983 984#ifdef CONFIG_TMPFS 985static const struct inode_operations shmem_symlink_inode_operations; 986static const struct inode_operations shmem_symlink_inline_operations; 987 988static int 989shmem_write_begin(struct file *file, struct address_space *mapping, 990 loff_t pos, unsigned len, unsigned flags, 991 struct page **pagep, void **fsdata) 992{ 993 struct inode *inode = mapping->host; 994 pgoff_t index = pos >> PAGE_CACHE_SHIFT; 995 return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL); 996} 997 998static int 999shmem_write_end(struct file *file, struct address_space *mapping, 1000 loff_t pos, unsigned len, unsigned copied, 1001 struct page *page, void *fsdata) 1002{ 1003 struct inode *inode = mapping->host; 1004 1005 if (pos + copied > inode->i_size) 1006 i_size_write(inode, pos + copied); 1007 1008 set_page_dirty(page); 1009 unlock_page(page); 1010 page_cache_release(page); 1011 1012 return copied; 1013} 1014 1015static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc, read_actor_t actor) 1016{ 1017 struct inode *inode = filp->f_path.dentry->d_inode; 1018 struct address_space *mapping = inode->i_mapping; 1019 unsigned long index, offset; 1020 enum sgp_type sgp = SGP_READ; 1021 1022 /* 1023 * Might this read be for a stacking filesystem? Then when reading 1024 * holes of a sparse file, we actually need to allocate those pages, 1025 * and even mark them dirty, so it cannot exceed the max_blocks limit. 1026 */ 1027 if (segment_eq(get_fs(), KERNEL_DS)) 1028 sgp = SGP_DIRTY; 1029 1030 index = *ppos >> PAGE_CACHE_SHIFT; 1031 offset = *ppos & ~PAGE_CACHE_MASK; 1032 1033 for (;;) { 1034 struct page *page = NULL; 1035 unsigned long end_index, nr, ret; 1036 loff_t i_size = i_size_read(inode); 1037 1038 end_index = i_size >> PAGE_CACHE_SHIFT; 1039 if (index > end_index) 1040 break; 1041 if (index == end_index) { 1042 nr = i_size & ~PAGE_CACHE_MASK; 1043 if (nr <= offset) 1044 break; 1045 } 1046 1047 desc->error = shmem_getpage(inode, index, &page, sgp, NULL); 1048 if (desc->error) { 1049 if (desc->error == -EINVAL) 1050 desc->error = 0; 1051 break; 1052 } 1053 if (page) 1054 unlock_page(page); 1055 1056 /* 1057 * We must evaluate after, since reads (unlike writes) 1058 * are called without i_mutex protection against truncate 1059 */ 1060 nr = PAGE_CACHE_SIZE; 1061 i_size = i_size_read(inode); 1062 end_index = i_size >> PAGE_CACHE_SHIFT; 1063 if (index == end_index) { 1064 nr = i_size & ~PAGE_CACHE_MASK; 1065 if (nr <= offset) { 1066 if (page) 1067 page_cache_release(page); 1068 break; 1069 } 1070 } 1071 nr -= offset; 1072 1073 if (page) { 1074 /* 1075 * If users can be writing to this page using arbitrary 1076 * virtual addresses, take care about potential aliasing 1077 * before reading the page on the kernel side. 1078 */ 1079 if (mapping_writably_mapped(mapping)) 1080 flush_dcache_page(page); 1081 /* 1082 * Mark the page accessed if we read the beginning. 1083 */ 1084 if (!offset) 1085 mark_page_accessed(page); 1086 } else { 1087 page = ZERO_PAGE(0); 1088 page_cache_get(page); 1089 } 1090 1091 /* 1092 * Ok, we have the page, and it's up-to-date, so 1093 * now we can copy it to user space... 1094 * 1095 * The actor routine returns how many bytes were actually used.. 1096 * NOTE! This may not be the same as how much of a user buffer 1097 * we filled up (we may be padding etc), so we can only update 1098 * "pos" here (the actor routine has to update the user buffer 1099 * pointers and the remaining count). 1100 */ 1101 ret = actor(desc, page, offset, nr); 1102 offset += ret; 1103 index += offset >> PAGE_CACHE_SHIFT; 1104 offset &= ~PAGE_CACHE_MASK; 1105 1106 page_cache_release(page); 1107 if (ret != nr || !desc->count) 1108 break; 1109 1110 cond_resched(); 1111 } 1112 1113 *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset; 1114 file_accessed(filp); 1115} 1116 1117static ssize_t shmem_file_aio_read(struct kiocb *iocb, 1118 const struct iovec *iov, unsigned long nr_segs, loff_t pos) 1119{ 1120 struct file *filp = iocb->ki_filp; 1121 ssize_t retval; 1122 unsigned long seg; 1123 size_t count; 1124 loff_t *ppos = &iocb->ki_pos; 1125 1126 retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); 1127 if (retval) 1128 return retval; 1129 1130 for (seg = 0; seg < nr_segs; seg++) { 1131 read_descriptor_t desc; 1132 1133 desc.written = 0; 1134 desc.arg.buf = iov[seg].iov_base; 1135 desc.count = iov[seg].iov_len; 1136 if (desc.count == 0) 1137 continue; 1138 desc.error = 0; 1139 do_shmem_file_read(filp, ppos, &desc, file_read_actor); 1140 retval += desc.written; 1141 if (desc.error) { 1142 retval = retval ?: desc.error; 1143 break; 1144 } 1145 if (desc.count > 0) 1146 break; 1147 } 1148 return retval; 1149} 1150 1151static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, 1152 struct pipe_inode_info *pipe, size_t len, 1153 unsigned int flags) 1154{ 1155 struct address_space *mapping = in->f_mapping; 1156 struct inode *inode = mapping->host; 1157 unsigned int loff, nr_pages, req_pages; 1158 struct page *pages[PIPE_DEF_BUFFERS]; 1159 struct partial_page partial[PIPE_DEF_BUFFERS]; 1160 struct page *page; 1161 pgoff_t index, end_index; 1162 loff_t isize, left; 1163 int error, page_nr; 1164 struct splice_pipe_desc spd = { 1165 .pages = pages, 1166 .partial = partial, 1167 .flags = flags, 1168 .ops = &page_cache_pipe_buf_ops, 1169 .spd_release = spd_release_page, 1170 }; 1171 1172 isize = i_size_read(inode); 1173 if (unlikely(*ppos >= isize)) 1174 return 0; 1175 1176 left = isize - *ppos; 1177 if (unlikely(left < len)) 1178 len = left; 1179 1180 if (splice_grow_spd(pipe, &spd)) 1181 return -ENOMEM; 1182 1183 index = *ppos >> PAGE_CACHE_SHIFT; 1184 loff = *ppos & ~PAGE_CACHE_MASK; 1185 req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 1186 nr_pages = min(req_pages, pipe->buffers); 1187 1188 spd.nr_pages = find_get_pages_contig(mapping, index, 1189 nr_pages, spd.pages); 1190 index += spd.nr_pages; 1191 error = 0; 1192 1193 while (spd.nr_pages < nr_pages) { 1194 error = shmem_getpage(inode, index, &page, SGP_CACHE, NULL); 1195 if (error) 1196 break; 1197 unlock_page(page); 1198 spd.pages[spd.nr_pages++] = page; 1199 index++; 1200 } 1201 1202 index = *ppos >> PAGE_CACHE_SHIFT; 1203 nr_pages = spd.nr_pages; 1204 spd.nr_pages = 0; 1205 1206 for (page_nr = 0; page_nr < nr_pages; page_nr++) { 1207 unsigned int this_len; 1208 1209 if (!len) 1210 break; 1211 1212 this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff); 1213 page = spd.pages[page_nr]; 1214 1215 if (!PageUptodate(page) || page->mapping != mapping) { 1216 error = shmem_getpage(inode, index, &page, 1217 SGP_CACHE, NULL); 1218 if (error) 1219 break; 1220 unlock_page(page); 1221 page_cache_release(spd.pages[page_nr]); 1222 spd.pages[page_nr] = page; 1223 } 1224 1225 isize = i_size_read(inode); 1226 end_index = (isize - 1) >> PAGE_CACHE_SHIFT; 1227 if (unlikely(!isize || index > end_index)) 1228 break; 1229 1230 if (end_index == index) { 1231 unsigned int plen; 1232 1233 plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1; 1234 if (plen <= loff) 1235 break; 1236 1237 this_len = min(this_len, plen - loff); 1238 len = this_len; 1239 } 1240 1241 spd.partial[page_nr].offset = loff; 1242 spd.partial[page_nr].len = this_len; 1243 len -= this_len; 1244 loff = 0; 1245 spd.nr_pages++; 1246 index++; 1247 } 1248 1249 while (page_nr < nr_pages) 1250 page_cache_release(spd.pages[page_nr++]); 1251 1252 if (spd.nr_pages) 1253 error = splice_to_pipe(pipe, &spd); 1254 1255 splice_shrink_spd(pipe, &spd); 1256 1257 if (error > 0) { 1258 *ppos += error; 1259 file_accessed(in); 1260 } 1261 return error; 1262} 1263 1264static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) 1265{ 1266 struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); 1267 1268 buf->f_type = TMPFS_MAGIC; 1269 buf->f_bsize = PAGE_CACHE_SIZE; 1270 buf->f_namelen = NAME_MAX; 1271 if (sbinfo->max_blocks) { 1272 buf->f_blocks = sbinfo->max_blocks; 1273 buf->f_bavail = buf->f_bfree = 1274 sbinfo->max_blocks - percpu_counter_sum(&sbinfo->used_blocks); 1275 } 1276 if (sbinfo->max_inodes) { 1277 buf->f_files = sbinfo->max_inodes; 1278 buf->f_ffree = sbinfo->free_inodes; 1279 } 1280 /* else leave those fields 0 like simple_statfs */ 1281 return 0; 1282} 1283 1284/* 1285 * File creation. Allocate an inode, and we're done.. 1286 */ 1287static int 1288shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) 1289{ 1290 struct inode *inode; 1291 int error = -ENOSPC; 1292 1293 inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE); 1294 if (inode) { 1295 error = security_inode_init_security(inode, dir, 1296 &dentry->d_name, NULL, 1297 NULL, NULL); 1298 if (error) { 1299 if (error != -EOPNOTSUPP) { 1300 iput(inode); 1301 return error; 1302 } 1303 } 1304#ifdef CONFIG_TMPFS_POSIX_ACL 1305 error = generic_acl_init(inode, dir); 1306 if (error) { 1307 iput(inode); 1308 return error; 1309 } 1310#else 1311 error = 0; 1312#endif 1313 dir->i_size += BOGO_DIRENT_SIZE; 1314 dir->i_ctime = dir->i_mtime = CURRENT_TIME; 1315 d_instantiate(dentry, inode); 1316 dget(dentry); /* Extra count - pin the dentry in core */ 1317 } 1318 return error; 1319} 1320 1321static int shmem_mkdir(struct inode *dir, struct dentry *dentry, int mode) 1322{ 1323 int error; 1324 1325 if ((error = shmem_mknod(dir, dentry, mode | S_IFDIR, 0))) 1326 return error; 1327 inc_nlink(dir); 1328 return 0; 1329} 1330 1331static int shmem_create(struct inode *dir, struct dentry *dentry, int mode, 1332 struct nameidata *nd) 1333{ 1334 return shmem_mknod(dir, dentry, mode | S_IFREG, 0); 1335} 1336 1337/* 1338 * Link a file.. 1339 */ 1340static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) 1341{ 1342 struct inode *inode = old_dentry->d_inode; 1343 int ret; 1344 1345 /* 1346 * No ordinary (disk based) filesystem counts links as inodes; 1347 * but each new link needs a new dentry, pinning lowmem, and 1348 * tmpfs dentries cannot be pruned until they are unlinked. 1349 */ 1350 ret = shmem_reserve_inode(inode->i_sb); 1351 if (ret) 1352 goto out; 1353 1354 dir->i_size += BOGO_DIRENT_SIZE; 1355 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; 1356 inc_nlink(inode); 1357 ihold(inode); /* New dentry reference */ 1358 dget(dentry); /* Extra pinning count for the created dentry */ 1359 d_instantiate(dentry, inode); 1360out: 1361 return ret; 1362} 1363 1364static int shmem_unlink(struct inode *dir, struct dentry *dentry) 1365{ 1366 struct inode *inode = dentry->d_inode; 1367 1368 if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode)) 1369 shmem_free_inode(inode->i_sb); 1370 1371 dir->i_size -= BOGO_DIRENT_SIZE; 1372 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; 1373 drop_nlink(inode); 1374 dput(dentry); /* Undo the count from "create" - this does all the work */ 1375 return 0; 1376} 1377 1378static int shmem_rmdir(struct inode *dir, struct dentry *dentry) 1379{ 1380 if (!simple_empty(dentry)) 1381 return -ENOTEMPTY; 1382 1383 drop_nlink(dentry->d_inode); 1384 drop_nlink(dir); 1385 return shmem_unlink(dir, dentry); 1386} 1387 1388/* 1389 * The VFS layer already does all the dentry stuff for rename, 1390 * we just have to decrement the usage count for the target if 1391 * it exists so that the VFS layer correctly free's it when it 1392 * gets overwritten. 1393 */ 1394static int shmem_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) 1395{ 1396 struct inode *inode = old_dentry->d_inode; 1397 int they_are_dirs = S_ISDIR(inode->i_mode); 1398 1399 if (!simple_empty(new_dentry)) 1400 return -ENOTEMPTY; 1401 1402 if (new_dentry->d_inode) { 1403 (void) shmem_unlink(new_dir, new_dentry); 1404 if (they_are_dirs) 1405 drop_nlink(old_dir); 1406 } else if (they_are_dirs) { 1407 drop_nlink(old_dir); 1408 inc_nlink(new_dir); 1409 } 1410 1411 old_dir->i_size -= BOGO_DIRENT_SIZE; 1412 new_dir->i_size += BOGO_DIRENT_SIZE; 1413 old_dir->i_ctime = old_dir->i_mtime = 1414 new_dir->i_ctime = new_dir->i_mtime = 1415 inode->i_ctime = CURRENT_TIME; 1416 return 0; 1417} 1418 1419static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *symname) 1420{ 1421 int error; 1422 int len; 1423 struct inode *inode; 1424 struct page *page; 1425 char *kaddr; 1426 struct shmem_inode_info *info; 1427 1428 len = strlen(symname) + 1; 1429 if (len > PAGE_CACHE_SIZE) 1430 return -ENAMETOOLONG; 1431 1432 inode = shmem_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0, VM_NORESERVE); 1433 if (!inode) 1434 return -ENOSPC; 1435 1436 error = security_inode_init_security(inode, dir, &dentry->d_name, NULL, 1437 NULL, NULL); 1438 if (error) { 1439 if (error != -EOPNOTSUPP) { 1440 iput(inode); 1441 return error; 1442 } 1443 error = 0; 1444 } 1445 1446 info = SHMEM_I(inode); 1447 inode->i_size = len-1; 1448 if (len <= SHMEM_SYMLINK_INLINE_LEN) { 1449 /* do it inline */ 1450 memcpy(info->inline_symlink, symname, len); 1451 inode->i_op = &shmem_symlink_inline_operations; 1452 } else { 1453 error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL); 1454 if (error) { 1455 iput(inode); 1456 return error; 1457 } 1458 inode->i_mapping->a_ops = &shmem_aops; 1459 inode->i_op = &shmem_symlink_inode_operations; 1460 kaddr = kmap_atomic(page, KM_USER0); 1461 memcpy(kaddr, symname, len); 1462 kunmap_atomic(kaddr, KM_USER0); 1463 set_page_dirty(page); 1464 unlock_page(page); 1465 page_cache_release(page); 1466 } 1467 dir->i_size += BOGO_DIRENT_SIZE; 1468 dir->i_ctime = dir->i_mtime = CURRENT_TIME; 1469 d_instantiate(dentry, inode); 1470 dget(dentry); 1471 return 0; 1472} 1473 1474static void *shmem_follow_link_inline(struct dentry *dentry, struct nameidata *nd) 1475{ 1476 nd_set_link(nd, SHMEM_I(dentry->d_inode)->inline_symlink); 1477 return NULL; 1478} 1479 1480static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd) 1481{ 1482 struct page *page = NULL; 1483 int res = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL); 1484 nd_set_link(nd, res ? ERR_PTR(res) : kmap(page)); 1485 if (page) 1486 unlock_page(page); 1487 return page; 1488} 1489 1490static void shmem_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) 1491{ 1492 if (!IS_ERR(nd_get_link(nd))) { 1493 struct page *page = cookie; 1494 kunmap(page); 1495 mark_page_accessed(page); 1496 page_cache_release(page); 1497 } 1498} 1499 1500#ifdef CONFIG_TMPFS_XATTR 1501/* 1502 * Superblocks without xattr inode operations may get some security.* xattr 1503 * support from the LSM "for free". As soon as we have any other xattrs 1504 * like ACLs, we also need to implement the security.* handlers at 1505 * filesystem level, though. 1506 */ 1507 1508static int shmem_xattr_get(struct dentry *dentry, const char *name, 1509 void *buffer, size_t size) 1510{ 1511 struct shmem_inode_info *info; 1512 struct shmem_xattr *xattr; 1513 int ret = -ENODATA; 1514 1515 info = SHMEM_I(dentry->d_inode); 1516 1517 spin_lock(&info->lock); 1518 list_for_each_entry(xattr, &info->xattr_list, list) { 1519 if (strcmp(name, xattr->name)) 1520 continue; 1521 1522 ret = xattr->size; 1523 if (buffer) { 1524 if (size < xattr->size) 1525 ret = -ERANGE; 1526 else 1527 memcpy(buffer, xattr->value, xattr->size); 1528 } 1529 break; 1530 } 1531 spin_unlock(&info->lock); 1532 return ret; 1533} 1534 1535static int shmem_xattr_set(struct dentry *dentry, const char *name, 1536 const void *value, size_t size, int flags) 1537{ 1538 struct inode *inode = dentry->d_inode; 1539 struct shmem_inode_info *info = SHMEM_I(inode); 1540 struct shmem_xattr *xattr; 1541 struct shmem_xattr *new_xattr = NULL; 1542 size_t len; 1543 int err = 0; 1544 1545 /* value == NULL means remove */ 1546 if (value) { 1547 /* wrap around? */ 1548 len = sizeof(*new_xattr) + size; 1549 if (len <= sizeof(*new_xattr)) 1550 return -ENOMEM; 1551 1552 new_xattr = kmalloc(len, GFP_KERNEL); 1553 if (!new_xattr) 1554 return -ENOMEM; 1555 1556 new_xattr->name = kstrdup(name, GFP_KERNEL); 1557 if (!new_xattr->name) { 1558 kfree(new_xattr); 1559 return -ENOMEM; 1560 } 1561 1562 new_xattr->size = size; 1563 memcpy(new_xattr->value, value, size); 1564 } 1565 1566 spin_lock(&info->lock); 1567 list_for_each_entry(xattr, &info->xattr_list, list) { 1568 if (!strcmp(name, xattr->name)) { 1569 if (flags & XATTR_CREATE) { 1570 xattr = new_xattr; 1571 err = -EEXIST; 1572 } else if (new_xattr) { 1573 list_replace(&xattr->list, &new_xattr->list); 1574 } else { 1575 list_del(&xattr->list); 1576 } 1577 goto out; 1578 } 1579 } 1580 if (flags & XATTR_REPLACE) { 1581 xattr = new_xattr; 1582 err = -ENODATA; 1583 } else { 1584 list_add(&new_xattr->list, &info->xattr_list); 1585 xattr = NULL; 1586 } 1587out: 1588 spin_unlock(&info->lock); 1589 if (xattr) 1590 kfree(xattr->name); 1591 kfree(xattr); 1592 return err; 1593} 1594 1595 1596static const struct xattr_handler *shmem_xattr_handlers[] = { 1597#ifdef CONFIG_TMPFS_POSIX_ACL 1598 &generic_acl_access_handler, 1599 &generic_acl_default_handler, 1600#endif 1601 NULL 1602}; 1603 1604static int shmem_xattr_validate(const char *name) 1605{ 1606 struct { const char *prefix; size_t len; } arr[] = { 1607 { XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN }, 1608 { XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN } 1609 }; 1610 int i; 1611 1612 for (i = 0; i < ARRAY_SIZE(arr); i++) { 1613 size_t preflen = arr[i].len; 1614 if (strncmp(name, arr[i].prefix, preflen) == 0) { 1615 if (!name[preflen]) 1616 return -EINVAL; 1617 return 0; 1618 } 1619 } 1620 return -EOPNOTSUPP; 1621} 1622 1623static ssize_t shmem_getxattr(struct dentry *dentry, const char *name, 1624 void *buffer, size_t size) 1625{ 1626 int err; 1627 1628 /* 1629 * If this is a request for a synthetic attribute in the system.* 1630 * namespace use the generic infrastructure to resolve a handler 1631 * for it via sb->s_xattr. 1632 */ 1633 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) 1634 return generic_getxattr(dentry, name, buffer, size); 1635 1636 err = shmem_xattr_validate(name); 1637 if (err) 1638 return err; 1639 1640 return shmem_xattr_get(dentry, name, buffer, size); 1641} 1642 1643static int shmem_setxattr(struct dentry *dentry, const char *name, 1644 const void *value, size_t size, int flags) 1645{ 1646 int err; 1647 1648 /* 1649 * If this is a request for a synthetic attribute in the system.* 1650 * namespace use the generic infrastructure to resolve a handler 1651 * for it via sb->s_xattr. 1652 */ 1653 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) 1654 return generic_setxattr(dentry, name, value, size, flags); 1655 1656 err = shmem_xattr_validate(name); 1657 if (err) 1658 return err; 1659 1660 if (size == 0) 1661 value = ""; /* empty EA, do not remove */ 1662 1663 return shmem_xattr_set(dentry, name, value, size, flags); 1664 1665} 1666 1667static int shmem_removexattr(struct dentry *dentry, const char *name) 1668{ 1669 int err; 1670 1671 /* 1672 * If this is a request for a synthetic attribute in the system.* 1673 * namespace use the generic infrastructure to resolve a handler 1674 * for it via sb->s_xattr. 1675 */ 1676 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) 1677 return generic_removexattr(dentry, name); 1678 1679 err = shmem_xattr_validate(name); 1680 if (err) 1681 return err; 1682 1683 return shmem_xattr_set(dentry, name, NULL, 0, XATTR_REPLACE); 1684} 1685 1686static bool xattr_is_trusted(const char *name) 1687{ 1688 return !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN); 1689} 1690 1691static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size) 1692{ 1693 bool trusted = capable(CAP_SYS_ADMIN); 1694 struct shmem_xattr *xattr; 1695 struct shmem_inode_info *info; 1696 size_t used = 0; 1697 1698 info = SHMEM_I(dentry->d_inode); 1699 1700 spin_lock(&info->lock); 1701 list_for_each_entry(xattr, &info->xattr_list, list) { 1702 size_t len; 1703 1704 /* skip "trusted." attributes for unprivileged callers */ 1705 if (!trusted && xattr_is_trusted(xattr->name)) 1706 continue; 1707 1708 len = strlen(xattr->name) + 1; 1709 used += len; 1710 if (buffer) { 1711 if (size < used) { 1712 used = -ERANGE; 1713 break; 1714 } 1715 memcpy(buffer, xattr->name, len); 1716 buffer += len; 1717 } 1718 } 1719 spin_unlock(&info->lock); 1720 1721 return used; 1722} 1723#endif /* CONFIG_TMPFS_XATTR */ 1724 1725static const struct inode_operations shmem_symlink_inline_operations = { 1726 .readlink = generic_readlink, 1727 .follow_link = shmem_follow_link_inline, 1728#ifdef CONFIG_TMPFS_XATTR 1729 .setxattr = shmem_setxattr, 1730 .getxattr = shmem_getxattr, 1731 .listxattr = shmem_listxattr, 1732 .removexattr = shmem_removexattr, 1733#endif 1734}; 1735 1736static const struct inode_operations shmem_symlink_inode_operations = { 1737 .readlink = generic_readlink, 1738 .follow_link = shmem_follow_link, 1739 .put_link = shmem_put_link, 1740#ifdef CONFIG_TMPFS_XATTR 1741 .setxattr = shmem_setxattr, 1742 .getxattr = shmem_getxattr, 1743 .listxattr = shmem_listxattr, 1744 .removexattr = shmem_removexattr, 1745#endif 1746}; 1747 1748static struct dentry *shmem_get_parent(struct dentry *child) 1749{ 1750 return ERR_PTR(-ESTALE); 1751} 1752 1753static int shmem_match(struct inode *ino, void *vfh) 1754{ 1755 __u32 *fh = vfh; 1756 __u64 inum = fh[2]; 1757 inum = (inum << 32) | fh[1]; 1758 return ino->i_ino == inum && fh[0] == ino->i_generation; 1759} 1760 1761static struct dentry *shmem_fh_to_dentry(struct super_block *sb, 1762 struct fid *fid, int fh_len, int fh_type) 1763{ 1764 struct inode *inode; 1765 struct dentry *dentry = NULL; 1766 u64 inum = fid->raw[2]; 1767 inum = (inum << 32) | fid->raw[1]; 1768 1769 if (fh_len < 3) 1770 return NULL; 1771 1772 inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]), 1773 shmem_match, fid->raw); 1774 if (inode) { 1775 dentry = d_find_alias(inode); 1776 iput(inode); 1777 } 1778 1779 return dentry; 1780} 1781 1782static int shmem_encode_fh(struct dentry *dentry, __u32 *fh, int *len, 1783 int connectable) 1784{ 1785 struct inode *inode = dentry->d_inode; 1786 1787 if (*len < 3) { 1788 *len = 3; 1789 return 255; 1790 } 1791 1792 if (inode_unhashed(inode)) { 1793 /* Unfortunately insert_inode_hash is not idempotent, 1794 * so as we hash inodes here rather than at creation 1795 * time, we need a lock to ensure we only try 1796 * to do it once 1797 */ 1798 static DEFINE_SPINLOCK(lock); 1799 spin_lock(&lock); 1800 if (inode_unhashed(inode)) 1801 __insert_inode_hash(inode, 1802 inode->i_ino + inode->i_generation); 1803 spin_unlock(&lock); 1804 } 1805 1806 fh[0] = inode->i_generation; 1807 fh[1] = inode->i_ino; 1808 fh[2] = ((__u64)inode->i_ino) >> 32; 1809 1810 *len = 3; 1811 return 1; 1812} 1813 1814static const struct export_operations shmem_export_ops = { 1815 .get_parent = shmem_get_parent, 1816 .encode_fh = shmem_encode_fh, 1817 .fh_to_dentry = shmem_fh_to_dentry, 1818}; 1819 1820static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo, 1821 bool remount) 1822{ 1823 char *this_char, *value, *rest; 1824 1825 while (options != NULL) { 1826 this_char = options; 1827 for (;;) { 1828 /* 1829 * NUL-terminate this option: unfortunately, 1830 * mount options form a comma-separated list, 1831 * but mpol's nodelist may also contain commas. 1832 */ 1833 options = strchr(options, ','); 1834 if (options == NULL) 1835 break; 1836 options++; 1837 if (!isdigit(*options)) { 1838 options[-1] = '\0'; 1839 break; 1840 } 1841 } 1842 if (!*this_char) 1843 continue; 1844 if ((value = strchr(this_char,'=')) != NULL) { 1845 *value++ = 0; 1846 } else { 1847 printk(KERN_ERR 1848 "tmpfs: No value for mount option '%s'\n", 1849 this_char); 1850 return 1; 1851 } 1852 1853 if (!strcmp(this_char,"size")) { 1854 unsigned long long size; 1855 size = memparse(value,&rest); 1856 if (*rest == '%') { 1857 size <<= PAGE_SHIFT; 1858 size *= totalram_pages; 1859 do_div(size, 100); 1860 rest++; 1861 } 1862 if (*rest) 1863 goto bad_val; 1864 sbinfo->max_blocks = 1865 DIV_ROUND_UP(size, PAGE_CACHE_SIZE); 1866 } else if (!strcmp(this_char,"nr_blocks")) { 1867 sbinfo->max_blocks = memparse(value, &rest); 1868 if (*rest) 1869 goto bad_val; 1870 } else if (!strcmp(this_char,"nr_inodes")) { 1871 sbinfo->max_inodes = memparse(value, &rest); 1872 if (*rest) 1873 goto bad_val; 1874 } else if (!strcmp(this_char,"mode")) { 1875 if (remount) 1876 continue; 1877 sbinfo->mode = simple_strtoul(value, &rest, 8) & 07777; 1878 if (*rest) 1879 goto bad_val; 1880 } else if (!strcmp(this_char,"uid")) { 1881 if (remount) 1882 continue; 1883 sbinfo->uid = simple_strtoul(value, &rest, 0); 1884 if (*rest) 1885 goto bad_val; 1886 } else if (!strcmp(this_char,"gid")) { 1887 if (remount) 1888 continue; 1889 sbinfo->gid = simple_strtoul(value, &rest, 0); 1890 if (*rest) 1891 goto bad_val; 1892 } else if (!strcmp(this_char,"mpol")) { 1893 if (mpol_parse_str(value, &sbinfo->mpol, 1)) 1894 goto bad_val; 1895 } else { 1896 printk(KERN_ERR "tmpfs: Bad mount option %s\n", 1897 this_char); 1898 return 1; 1899 } 1900 } 1901 return 0; 1902 1903bad_val: 1904 printk(KERN_ERR "tmpfs: Bad value '%s' for mount option '%s'\n", 1905 value, this_char); 1906 return 1; 1907 1908} 1909 1910static int shmem_remount_fs(struct super_block *sb, int *flags, char *data) 1911{ 1912 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 1913 struct shmem_sb_info config = *sbinfo; 1914 unsigned long inodes; 1915 int error = -EINVAL; 1916 1917 if (shmem_parse_options(data, &config, true)) 1918 return error; 1919 1920 spin_lock(&sbinfo->stat_lock); 1921 inodes = sbinfo->max_inodes - sbinfo->free_inodes; 1922 if (percpu_counter_compare(&sbinfo->used_blocks, config.max_blocks) > 0) 1923 goto out; 1924 if (config.max_inodes < inodes) 1925 goto out; 1926 /* 1927 * Those tests also disallow limited->unlimited while any are in 1928 * use, so i_blocks will always be zero when max_blocks is zero; 1929 * but we must separately disallow unlimited->limited, because 1930 * in that case we have no record of how much is already in use. 1931 */ 1932 if (config.max_blocks && !sbinfo->max_blocks) 1933 goto out; 1934 if (config.max_inodes && !sbinfo->max_inodes) 1935 goto out; 1936 1937 error = 0; 1938 sbinfo->max_blocks = config.max_blocks; 1939 sbinfo->max_inodes = config.max_inodes; 1940 sbinfo->free_inodes = config.max_inodes - inodes; 1941 1942 mpol_put(sbinfo->mpol); 1943 sbinfo->mpol = config.mpol; /* transfers initial ref */ 1944out: 1945 spin_unlock(&sbinfo->stat_lock); 1946 return error; 1947} 1948 1949static int shmem_show_options(struct seq_file *seq, struct vfsmount *vfs) 1950{ 1951 struct shmem_sb_info *sbinfo = SHMEM_SB(vfs->mnt_sb); 1952 1953 if (sbinfo->max_blocks != shmem_default_max_blocks()) 1954 seq_printf(seq, ",size=%luk", 1955 sbinfo->max_blocks << (PAGE_CACHE_SHIFT - 10)); 1956 if (sbinfo->max_inodes != shmem_default_max_inodes()) 1957 seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes); 1958 if (sbinfo->mode != (S_IRWXUGO | S_ISVTX)) 1959 seq_printf(seq, ",mode=%03o", sbinfo->mode); 1960 if (sbinfo->uid != 0) 1961 seq_printf(seq, ",uid=%u", sbinfo->uid); 1962 if (sbinfo->gid != 0) 1963 seq_printf(seq, ",gid=%u", sbinfo->gid); 1964 shmem_show_mpol(seq, sbinfo->mpol); 1965 return 0; 1966} 1967#endif /* CONFIG_TMPFS */ 1968 1969static void shmem_put_super(struct super_block *sb) 1970{ 1971 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 1972 1973 percpu_counter_destroy(&sbinfo->used_blocks); 1974 kfree(sbinfo); 1975 sb->s_fs_info = NULL; 1976} 1977 1978int shmem_fill_super(struct super_block *sb, void *data, int silent) 1979{ 1980 struct inode *inode; 1981 struct dentry *root; 1982 struct shmem_sb_info *sbinfo; 1983 int err = -ENOMEM; 1984 1985 /* Round up to L1_CACHE_BYTES to resist false sharing */ 1986 sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info), 1987 L1_CACHE_BYTES), GFP_KERNEL); 1988 if (!sbinfo) 1989 return -ENOMEM; 1990 1991 sbinfo->mode = S_IRWXUGO | S_ISVTX; 1992 sbinfo->uid = current_fsuid(); 1993 sbinfo->gid = current_fsgid(); 1994 sb->s_fs_info = sbinfo; 1995 1996#ifdef CONFIG_TMPFS 1997 /* 1998 * Per default we only allow half of the physical ram per 1999 * tmpfs instance, limiting inodes to one per page of lowmem; 2000 * but the internal instance is left unlimited. 2001 */ 2002 if (!(sb->s_flags & MS_NOUSER)) { 2003 sbinfo->max_blocks = shmem_default_max_blocks(); 2004 sbinfo->max_inodes = shmem_default_max_inodes(); 2005 if (shmem_parse_options(data, sbinfo, false)) { 2006 err = -EINVAL; 2007 goto failed; 2008 } 2009 } 2010 sb->s_export_op = &shmem_export_ops; 2011#else 2012 sb->s_flags |= MS_NOUSER; 2013#endif 2014 2015 spin_lock_init(&sbinfo->stat_lock); 2016 if (percpu_counter_init(&sbinfo->used_blocks, 0)) 2017 goto failed; 2018 sbinfo->free_inodes = sbinfo->max_inodes; 2019 2020 sb->s_maxbytes = MAX_LFS_FILESIZE; 2021 sb->s_blocksize = PAGE_CACHE_SIZE; 2022 sb->s_blocksize_bits = PAGE_CACHE_SHIFT; 2023 sb->s_magic = TMPFS_MAGIC; 2024 sb->s_op = &shmem_ops; 2025 sb->s_time_gran = 1; 2026#ifdef CONFIG_TMPFS_XATTR 2027 sb->s_xattr = shmem_xattr_handlers; 2028#endif 2029#ifdef CONFIG_TMPFS_POSIX_ACL 2030 sb->s_flags |= MS_POSIXACL; 2031#endif 2032 2033 inode = shmem_get_inode(sb, NULL, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE); 2034 if (!inode) 2035 goto failed; 2036 inode->i_uid = sbinfo->uid; 2037 inode->i_gid = sbinfo->gid; 2038 root = d_alloc_root(inode); 2039 if (!root) 2040 goto failed_iput; 2041 sb->s_root = root; 2042 return 0; 2043 2044failed_iput: 2045 iput(inode); 2046failed: 2047 shmem_put_super(sb); 2048 return err; 2049} 2050 2051static struct kmem_cache *shmem_inode_cachep; 2052 2053static struct inode *shmem_alloc_inode(struct super_block *sb) 2054{ 2055 struct shmem_inode_info *p; 2056 p = (struct shmem_inode_info *)kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL); 2057 if (!p) 2058 return NULL; 2059 return &p->vfs_inode; 2060} 2061 2062static void shmem_i_callback(struct rcu_head *head) 2063{ 2064 struct inode *inode = container_of(head, struct inode, i_rcu); 2065 INIT_LIST_HEAD(&inode->i_dentry); 2066 kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); 2067} 2068 2069static void shmem_destroy_inode(struct inode *inode) 2070{ 2071 if ((inode->i_mode & S_IFMT) == S_IFREG) { 2072 /* only struct inode is valid if it's an inline symlink */ 2073 mpol_free_shared_policy(&SHMEM_I(inode)->policy); 2074 } 2075 call_rcu(&inode->i_rcu, shmem_i_callback); 2076} 2077 2078static void init_once(void *foo) 2079{ 2080 struct shmem_inode_info *p = (struct shmem_inode_info *) foo; 2081 2082 inode_init_once(&p->vfs_inode); 2083} 2084 2085static int init_inodecache(void) 2086{ 2087 shmem_inode_cachep = kmem_cache_create("shmem_inode_cache", 2088 sizeof(struct shmem_inode_info), 2089 0, SLAB_PANIC, init_once); 2090 return 0; 2091} 2092 2093static void destroy_inodecache(void) 2094{ 2095 kmem_cache_destroy(shmem_inode_cachep); 2096} 2097 2098static const struct address_space_operations shmem_aops = { 2099 .writepage = shmem_writepage, 2100 .set_page_dirty = __set_page_dirty_no_writeback, 2101#ifdef CONFIG_TMPFS 2102 .write_begin = shmem_write_begin, 2103 .write_end = shmem_write_end, 2104#endif 2105 .migratepage = migrate_page, 2106 .error_remove_page = generic_error_remove_page, 2107}; 2108 2109static const struct file_operations shmem_file_operations = { 2110 .mmap = shmem_mmap, 2111#ifdef CONFIG_TMPFS 2112 .llseek = generic_file_llseek, 2113 .read = do_sync_read, 2114 .write = do_sync_write, 2115 .aio_read = shmem_file_aio_read, 2116 .aio_write = generic_file_aio_write, 2117 .fsync = noop_fsync, 2118 .splice_read = shmem_file_splice_read, 2119 .splice_write = generic_file_splice_write, 2120#endif 2121}; 2122 2123static const struct inode_operations shmem_inode_operations = { 2124 .setattr = shmem_setattr, 2125 .truncate_range = shmem_truncate_range, 2126#ifdef CONFIG_TMPFS_XATTR 2127 .setxattr = shmem_setxattr, 2128 .getxattr = shmem_getxattr, 2129 .listxattr = shmem_listxattr, 2130 .removexattr = shmem_removexattr, 2131#endif 2132}; 2133 2134static const struct inode_operations shmem_dir_inode_operations = { 2135#ifdef CONFIG_TMPFS 2136 .create = shmem_create, 2137 .lookup = simple_lookup, 2138 .link = shmem_link, 2139 .unlink = shmem_unlink, 2140 .symlink = shmem_symlink, 2141 .mkdir = shmem_mkdir, 2142 .rmdir = shmem_rmdir, 2143 .mknod = shmem_mknod, 2144 .rename = shmem_rename, 2145#endif 2146#ifdef CONFIG_TMPFS_XATTR 2147 .setxattr = shmem_setxattr, 2148 .getxattr = shmem_getxattr, 2149 .listxattr = shmem_listxattr, 2150 .removexattr = shmem_removexattr, 2151#endif 2152#ifdef CONFIG_TMPFS_POSIX_ACL 2153 .setattr = shmem_setattr, 2154#endif 2155}; 2156 2157static const struct inode_operations shmem_special_inode_operations = { 2158#ifdef CONFIG_TMPFS_XATTR 2159 .setxattr = shmem_setxattr, 2160 .getxattr = shmem_getxattr, 2161 .listxattr = shmem_listxattr, 2162 .removexattr = shmem_removexattr, 2163#endif 2164#ifdef CONFIG_TMPFS_POSIX_ACL 2165 .setattr = shmem_setattr, 2166#endif 2167}; 2168 2169static const struct super_operations shmem_ops = { 2170 .alloc_inode = shmem_alloc_inode, 2171 .destroy_inode = shmem_destroy_inode, 2172#ifdef CONFIG_TMPFS 2173 .statfs = shmem_statfs, 2174 .remount_fs = shmem_remount_fs, 2175 .show_options = shmem_show_options, 2176#endif 2177 .evict_inode = shmem_evict_inode, 2178 .drop_inode = generic_delete_inode, 2179 .put_super = shmem_put_super, 2180}; 2181 2182static const struct vm_operations_struct shmem_vm_ops = { 2183 .fault = shmem_fault, 2184#ifdef CONFIG_NUMA 2185 .set_policy = shmem_set_policy, 2186 .get_policy = shmem_get_policy, 2187#endif 2188}; 2189 2190 2191static struct dentry *shmem_mount(struct file_system_type *fs_type, 2192 int flags, const char *dev_name, void *data) 2193{ 2194 return mount_nodev(fs_type, flags, data, shmem_fill_super); 2195} 2196 2197static struct file_system_type tmpfs_fs_type = { 2198 .owner = THIS_MODULE, 2199 .name = "tmpfs", 2200 .mount = shmem_mount, 2201 .kill_sb = kill_litter_super, 2202}; 2203 2204int __init init_tmpfs(void) 2205{ 2206 int error; 2207 2208 error = bdi_init(&shmem_backing_dev_info); 2209 if (error) 2210 goto out4; 2211 2212 error = init_inodecache(); 2213 if (error) 2214 goto out3; 2215 2216 error = register_filesystem(&tmpfs_fs_type); 2217 if (error) { 2218 printk(KERN_ERR "Could not register tmpfs\n"); 2219 goto out2; 2220 } 2221 2222 shm_mnt = vfs_kern_mount(&tmpfs_fs_type, MS_NOUSER, 2223 tmpfs_fs_type.name, NULL); 2224 if (IS_ERR(shm_mnt)) { 2225 error = PTR_ERR(shm_mnt); 2226 printk(KERN_ERR "Could not kern_mount tmpfs\n"); 2227 goto out1; 2228 } 2229 return 0; 2230 2231out1: 2232 unregister_filesystem(&tmpfs_fs_type); 2233out2: 2234 destroy_inodecache(); 2235out3: 2236 bdi_destroy(&shmem_backing_dev_info); 2237out4: 2238 shm_mnt = ERR_PTR(error); 2239 return error; 2240} 2241 2242#ifdef CONFIG_CGROUP_MEM_RES_CTLR 2243/** 2244 * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file 2245 * @inode: the inode to be searched 2246 * @pgoff: the offset to be searched 2247 * @pagep: the pointer for the found page to be stored 2248 * @ent: the pointer for the found swap entry to be stored 2249 * 2250 * If a page is found, refcount of it is incremented. Callers should handle 2251 * these refcount. 2252 */ 2253void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff, 2254 struct page **pagep, swp_entry_t *ent) 2255{ 2256 swp_entry_t entry = { .val = 0 }; 2257 struct page *page = NULL; 2258 struct shmem_inode_info *info = SHMEM_I(inode); 2259 2260 if ((pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode)) 2261 goto out; 2262 2263 spin_lock(&info->lock); 2264#ifdef CONFIG_SWAP 2265 entry = shmem_get_swap(info, pgoff); 2266 if (entry.val) 2267 page = find_get_page(&swapper_space, entry.val); 2268 else 2269#endif 2270 page = find_get_page(inode->i_mapping, pgoff); 2271 spin_unlock(&info->lock); 2272out: 2273 *pagep = page; 2274 *ent = entry; 2275} 2276#endif 2277 2278#else /* !CONFIG_SHMEM */ 2279 2280/* 2281 * tiny-shmem: simple shmemfs and tmpfs using ramfs code 2282 * 2283 * This is intended for small system where the benefits of the full 2284 * shmem code (swap-backed and resource-limited) are outweighed by 2285 * their complexity. On systems without swap this code should be 2286 * effectively equivalent, but much lighter weight. 2287 */ 2288 2289#include <linux/ramfs.h> 2290 2291static struct file_system_type tmpfs_fs_type = { 2292 .name = "tmpfs", 2293 .mount = ramfs_mount, 2294 .kill_sb = kill_litter_super, 2295}; 2296 2297int __init init_tmpfs(void) 2298{ 2299 BUG_ON(register_filesystem(&tmpfs_fs_type) != 0); 2300 2301 shm_mnt = kern_mount(&tmpfs_fs_type); 2302 BUG_ON(IS_ERR(shm_mnt)); 2303 2304 return 0; 2305} 2306 2307int shmem_unuse(swp_entry_t entry, struct page *page) 2308{ 2309 return 0; 2310} 2311 2312int shmem_lock(struct file *file, int lock, struct user_struct *user) 2313{ 2314 return 0; 2315} 2316 2317void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end) 2318{ 2319 truncate_inode_pages_range(inode->i_mapping, start, end); 2320} 2321EXPORT_SYMBOL_GPL(shmem_truncate_range); 2322 2323#ifdef CONFIG_CGROUP_MEM_RES_CTLR 2324/** 2325 * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file 2326 * @inode: the inode to be searched 2327 * @pgoff: the offset to be searched 2328 * @pagep: the pointer for the found page to be stored 2329 * @ent: the pointer for the found swap entry to be stored 2330 * 2331 * If a page is found, refcount of it is incremented. Callers should handle 2332 * these refcount. 2333 */ 2334void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff, 2335 struct page **pagep, swp_entry_t *ent) 2336{ 2337 struct page *page = NULL; 2338 2339 if ((pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode)) 2340 goto out; 2341 page = find_get_page(inode->i_mapping, pgoff); 2342out: 2343 *pagep = page; 2344 *ent = (swp_entry_t){ .val = 0 }; 2345} 2346#endif 2347 2348#define shmem_vm_ops generic_file_vm_ops 2349#define shmem_file_operations ramfs_file_operations 2350#define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev) 2351#define shmem_acct_size(flags, size) 0 2352#define shmem_unacct_size(flags, size) do {} while (0) 2353 2354#endif /* CONFIG_SHMEM */ 2355 2356/* common code */ 2357 2358/** 2359 * shmem_file_setup - get an unlinked file living in tmpfs 2360 * @name: name for dentry (to be seen in /proc/<pid>/maps 2361 * @size: size to be set for the file 2362 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size 2363 */ 2364struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags) 2365{ 2366 int error; 2367 struct file *file; 2368 struct inode *inode; 2369 struct path path; 2370 struct dentry *root; 2371 struct qstr this; 2372 2373 if (IS_ERR(shm_mnt)) 2374 return (void *)shm_mnt; 2375 2376 if (size < 0 || size > MAX_LFS_FILESIZE) 2377 return ERR_PTR(-EINVAL); 2378 2379 if (shmem_acct_size(flags, size)) 2380 return ERR_PTR(-ENOMEM); 2381 2382 error = -ENOMEM; 2383 this.name = name; 2384 this.len = strlen(name); 2385 this.hash = 0; /* will go */ 2386 root = shm_mnt->mnt_root; 2387 path.dentry = d_alloc(root, &this); 2388 if (!path.dentry) 2389 goto put_memory; 2390 path.mnt = mntget(shm_mnt); 2391 2392 error = -ENOSPC; 2393 inode = shmem_get_inode(root->d_sb, NULL, S_IFREG | S_IRWXUGO, 0, flags); 2394 if (!inode) 2395 goto put_dentry; 2396 2397 d_instantiate(path.dentry, inode); 2398 inode->i_size = size; 2399 inode->i_nlink = 0; /* It is unlinked */ 2400#ifndef CONFIG_MMU 2401 error = ramfs_nommu_expand_for_mapping(inode, size); 2402 if (error) 2403 goto put_dentry; 2404#endif 2405 2406 error = -ENFILE; 2407 file = alloc_file(&path, FMODE_WRITE | FMODE_READ, 2408 &shmem_file_operations); 2409 if (!file) 2410 goto put_dentry; 2411 2412 return file; 2413 2414put_dentry: 2415 path_put(&path); 2416put_memory: 2417 shmem_unacct_size(flags, size); 2418 return ERR_PTR(error); 2419} 2420EXPORT_SYMBOL_GPL(shmem_file_setup); 2421 2422/** 2423 * shmem_zero_setup - setup a shared anonymous mapping 2424 * @vma: the vma to be mmapped is prepared by do_mmap_pgoff 2425 */ 2426int shmem_zero_setup(struct vm_area_struct *vma) 2427{ 2428 struct file *file; 2429 loff_t size = vma->vm_end - vma->vm_start; 2430 2431 file = shmem_file_setup("dev/zero", size, vma->vm_flags); 2432 if (IS_ERR(file)) 2433 return PTR_ERR(file); 2434 2435 if (vma->vm_file) 2436 fput(vma->vm_file); 2437 vma->vm_file = file; 2438 vma->vm_ops = &shmem_vm_ops; 2439 vma->vm_flags |= VM_CAN_NONLINEAR; 2440 return 0; 2441} 2442 2443/** 2444 * shmem_read_mapping_page_gfp - read into page cache, using specified page allocation flags. 2445 * @mapping: the page's address_space 2446 * @index: the page index 2447 * @gfp: the page allocator flags to use if allocating 2448 * 2449 * This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)", 2450 * with any new page allocations done using the specified allocation flags. 2451 * But read_cache_page_gfp() uses the ->readpage() method: which does not 2452 * suit tmpfs, since it may have pages in swapcache, and needs to find those 2453 * for itself; although drivers/gpu/drm i915 and ttm rely upon this support. 2454 * 2455 * i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in 2456 * with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily. 2457 */ 2458struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, 2459 pgoff_t index, gfp_t gfp) 2460{ 2461#ifdef CONFIG_SHMEM 2462 struct inode *inode = mapping->host; 2463 struct page *page; 2464 int error; 2465 2466 BUG_ON(mapping->a_ops != &shmem_aops); 2467 error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE, gfp, NULL); 2468 if (error) 2469 page = ERR_PTR(error); 2470 else 2471 unlock_page(page); 2472 return page; 2473#else 2474 /* 2475 * The tiny !SHMEM case uses ramfs without swap 2476 */ 2477 return read_cache_page_gfp(mapping, index, gfp); 2478#endif 2479} 2480EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp); 2481