shmem.c revision 48f170fb7d7db8789ccc23e051af61f62af5f685
/*
 * Resizable virtual memory filesystem for Linux.
 *
 * Copyright (C) 2000 Linus Torvalds.
 *		 2000 Transmeta Corp.
 *		 2000-2001 Christoph Rohland
 *		 2000-2001 SAP AG
 *		 2002 Red Hat Inc.
 * Copyright (C) 2002-2005 Hugh Dickins.
 * Copyright (C) 2002-2005 VERITAS Software Corporation.
 * Copyright (C) 2004 Andi Kleen, SuSE Labs
 *
 * Extended attribute support for tmpfs:
 * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net>
 * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
 *
 * tiny-shmem:
 * Copyright (c) 2004, 2008 Matt Mackall <mpm@selenic.com>
 *
 * This file is released under the GPL.
 */

#include <linux/fs.h>
#include <linux/init.h>
#include <linux/vfs.h>
#include <linux/mount.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/percpu_counter.h>
#include <linux/swap.h>

static struct vfsmount *shm_mnt;

#ifdef CONFIG_SHMEM
/*
 * This virtual memory filesystem is heavily based on the ramfs. It
 * extends ramfs by the ability to use swap and honor resource limits
 * which makes it a completely usable filesystem.
 */

#include <linux/xattr.h>
#include <linux/exportfs.h>
#include <linux/posix_acl.h>
#include <linux/generic_acl.h>
#include <linux/mman.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/backing-dev.h>
#include <linux/shmem_fs.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/splice.h>
#include <linux/security.h>
#include <linux/swapops.h>
#include <linux/mempolicy.h>
#include <linux/namei.h>
#include <linux/ctype.h>
#include <linux/migrate.h>
#include <linux/highmem.h>
#include <linux/seq_file.h>
#include <linux/magic.h>

#include <asm/uaccess.h>
#include <asm/div64.h>
#include <asm/pgtable.h>

/*
 * The maximum size of a shmem/tmpfs file is limited by the maximum size of
 * its triple-indirect swap vector - see illustration at shmem_swp_entry().
 *
 * With 4kB page size, maximum file size is just over 2TB on a 32-bit kernel,
 * but one eighth of that on a 64-bit kernel.  With 8kB page size, maximum
 * file size is just over 4TB on a 64-bit kernel, but 16TB on a 32-bit kernel,
 * MAX_LFS_FILESIZE being then more restrictive than swap vector layout.
 *
 * We use / and * instead of shifts in the definitions below, so that the swap
 * vector can be tested with small even values (e.g. 20) for ENTRIES_PER_PAGE.
 */
#define ENTRIES_PER_PAGE (PAGE_CACHE_SIZE/sizeof(unsigned long))
#define ENTRIES_PER_PAGEPAGE ((unsigned long long) ENTRIES_PER_PAGE*ENTRIES_PER_PAGE)

#define SHMSWP_MAX_INDEX (SHMEM_NR_DIRECT + (ENTRIES_PER_PAGEPAGE/2) * (ENTRIES_PER_PAGE+1))
#define SHMSWP_MAX_BYTES (SHMSWP_MAX_INDEX << PAGE_CACHE_SHIFT)

#define SHMEM_MAX_BYTES  min_t(unsigned long long, SHMSWP_MAX_BYTES, MAX_LFS_FILESIZE)
#define SHMEM_MAX_INDEX  ((unsigned long)((SHMEM_MAX_BYTES+1) >> PAGE_CACHE_SHIFT))

#define BLOCKS_PER_PAGE  (PAGE_CACHE_SIZE/512)
#define VM_ACCT(size)    (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT)
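
/*
 * Reviewer's worked example (not from the original source), assuming
 * PAGE_CACHE_SIZE == 4096 and SHMEM_NR_DIRECT == 16 as in the
 * shmem_swp_entry() comment below: a 32-bit kernel has
 * ENTRIES_PER_PAGE = 4096/4 = 1024, so
 * SHMSWP_MAX_INDEX = 16 + (1024*1024/2) * 1025 = 537395216 pages,
 * i.e. just over 2TB.  A 64-bit kernel halves ENTRIES_PER_PAGE to 512,
 * giving 16 + (512*512/2) * 513 = 67239952 pages, about 275GB - one
 * eighth of the 32-bit figure, matching the comment above.
 */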

/* info->flags needs VM_flags to handle pagein/truncate races efficiently */
#define SHMEM_PAGEIN	 VM_READ
#define SHMEM_TRUNCATE	 VM_WRITE

/* Definition to limit shmem_truncate's steps between cond_rescheds */
#define LATENCY_LIMIT	 64

/* Pretend that each entry is of this size in directory's i_size */
#define BOGO_DIRENT_SIZE 20

struct shmem_xattr {
	struct list_head list;	/* anchored by shmem_inode_info->xattr_list */
	char *name;		/* xattr name */
	size_t size;
	char value[0];
};

/* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */
enum sgp_type {
	SGP_READ,	/* don't exceed i_size, don't allocate page */
	SGP_CACHE,	/* don't exceed i_size, may allocate page */
	SGP_DIRTY,	/* like SGP_CACHE, but set new page dirty */
	SGP_WRITE,	/* may exceed i_size, may allocate page */
};
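
/*
 * Reviewer's note on how these modes are used later in this file:
 * SGP_READ comes from read-only paths such as do_shmem_file_read(),
 * shmem_follow_link() and shmem_setattr(); SGP_CACHE from shmem_fault()
 * and shmem_file_splice_read(); SGP_DIRTY from do_shmem_file_read()
 * when reading on behalf of a stacking filesystem; and SGP_WRITE from
 * shmem_write_begin() and shmem_symlink().
 */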

#ifdef CONFIG_TMPFS
static unsigned long shmem_default_max_blocks(void)
{
	return totalram_pages / 2;
}

static unsigned long shmem_default_max_inodes(void)
{
	return min(totalram_pages - totalhigh_pages, totalram_pages / 2);
}
#endif

static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
	struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type);

static inline int shmem_getpage(struct inode *inode, pgoff_t index,
	struct page **pagep, enum sgp_type sgp, int *fault_type)
{
	return shmem_getpage_gfp(inode, index, pagep, sgp,
			mapping_gfp_mask(inode->i_mapping), fault_type);
}

static inline struct page *shmem_dir_alloc(gfp_t gfp_mask)
{
	/*
	 * The above definition of ENTRIES_PER_PAGE, and the use of
	 * BLOCKS_PER_PAGE on indirect pages, assume PAGE_CACHE_SIZE:
	 * might be reconsidered if it ever diverges from PAGE_SIZE.
	 *
	 * Mobility flags are masked out as swap vectors cannot move
	 */
	return alloc_pages((gfp_mask & ~GFP_MOVABLE_MASK) | __GFP_ZERO,
				PAGE_CACHE_SHIFT-PAGE_SHIFT);
}

static inline void shmem_dir_free(struct page *page)
{
	__free_pages(page, PAGE_CACHE_SHIFT-PAGE_SHIFT);
}

static struct page **shmem_dir_map(struct page *page)
{
	return (struct page **)kmap_atomic(page, KM_USER0);
}

static inline void shmem_dir_unmap(struct page **dir)
{
	kunmap_atomic(dir, KM_USER0);
}

static swp_entry_t *shmem_swp_map(struct page *page)
{
	return (swp_entry_t *)kmap_atomic(page, KM_USER1);
}

static inline void shmem_swp_balance_unmap(void)
{
	/*
	 * When passing a pointer to an i_direct entry, to code which
	 * also handles indirect entries and so will shmem_swp_unmap,
	 * we must arrange for the preempt count to remain in balance.
	 * What kmap_atomic of a lowmem page does depends on config
	 * and architecture, so pretend to kmap_atomic some lowmem page.
	 */
	(void) kmap_atomic(ZERO_PAGE(0), KM_USER1);
}

static inline void shmem_swp_unmap(swp_entry_t *entry)
{
	kunmap_atomic(entry, KM_USER1);
}

static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
{
	return sb->s_fs_info;
}

/*
 * shmem_file_setup pre-accounts the whole fixed size of a VM object,
 * for shared memory and for shared anonymous (/dev/zero) mappings
 * (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1),
 * consistent with the pre-accounting of private mappings ...
 */
static inline int shmem_acct_size(unsigned long flags, loff_t size)
{
	return (flags & VM_NORESERVE) ?
		0 : security_vm_enough_memory_kern(VM_ACCT(size));
}

static inline void shmem_unacct_size(unsigned long flags, loff_t size)
{
	if (!(flags & VM_NORESERVE))
		vm_unacct_memory(VM_ACCT(size));
}

/*
 * ... whereas tmpfs objects are accounted incrementally as
 * pages are allocated, in order to allow huge sparse files.
 * shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM,
 * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
 */
static inline int shmem_acct_block(unsigned long flags)
{
	return (flags & VM_NORESERVE) ?
		security_vm_enough_memory_kern(VM_ACCT(PAGE_CACHE_SIZE)) : 0;
}

static inline void shmem_unacct_blocks(unsigned long flags, long pages)
{
	if (flags & VM_NORESERVE)
		vm_unacct_memory(pages * VM_ACCT(PAGE_CACHE_SIZE));
}
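
/*
 * Reviewer's illustration of the accounting split above, assuming 4kB
 * pages: VM_ACCT(10000) = PAGE_CACHE_ALIGN(10000) >> PAGE_SHIFT
 * = 12288 >> 12 = 3, so shmem_acct_size() charges a 10000-byte shared
 * memory object three pages up front, while a sparse tmpfs file of the
 * same size is charged one page at a time via shmem_acct_block() only
 * as pages are actually allocated.
 */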

static const struct super_operations shmem_ops;
static const struct address_space_operations shmem_aops;
static const struct file_operations shmem_file_operations;
static const struct inode_operations shmem_inode_operations;
static const struct inode_operations shmem_dir_inode_operations;
static const struct inode_operations shmem_special_inode_operations;
static const struct vm_operations_struct shmem_vm_ops;

static struct backing_dev_info shmem_backing_dev_info  __read_mostly = {
	.ra_pages	= 0,	/* No readahead */
	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
};

static LIST_HEAD(shmem_swaplist);
static DEFINE_MUTEX(shmem_swaplist_mutex);

static void shmem_free_blocks(struct inode *inode, long pages)
{
	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
	if (sbinfo->max_blocks) {
		percpu_counter_add(&sbinfo->used_blocks, -pages);
		inode->i_blocks -= pages*BLOCKS_PER_PAGE;
	}
}

static int shmem_reserve_inode(struct super_block *sb)
{
	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
	if (sbinfo->max_inodes) {
		spin_lock(&sbinfo->stat_lock);
		if (!sbinfo->free_inodes) {
			spin_unlock(&sbinfo->stat_lock);
			return -ENOSPC;
		}
		sbinfo->free_inodes--;
		spin_unlock(&sbinfo->stat_lock);
	}
	return 0;
}

static void shmem_free_inode(struct super_block *sb)
{
	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
	if (sbinfo->max_inodes) {
		spin_lock(&sbinfo->stat_lock);
		sbinfo->free_inodes++;
		spin_unlock(&sbinfo->stat_lock);
	}
}

/**
 * shmem_recalc_inode - recalculate the size of an inode
 * @inode: inode to recalc
 *
 * We have to calculate the free blocks since the mm can drop
 * undirtied hole pages behind our back.
 *
 * But normally   info->alloced == inode->i_mapping->nrpages + info->swapped
 * So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped)
 *
 * It has to be called with the spinlock held.
 */
static void shmem_recalc_inode(struct inode *inode)
{
	struct shmem_inode_info *info = SHMEM_I(inode);
	long freed;

	freed = info->alloced - info->swapped - inode->i_mapping->nrpages;
	if (freed > 0) {
		info->alloced -= freed;
		shmem_unacct_blocks(info->flags, freed);
		shmem_free_blocks(inode, freed);
	}
}
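
/*
 * Reviewer's example of the recalculation above: if info->alloced is 10
 * while info->swapped is 2 and i_mapping->nrpages is 5, then reclaim has
 * silently dropped 10 - 2 - 5 = 3 clean hole pages, and those 3 blocks
 * are returned to the accounting pools here.
 */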

/**
 * shmem_swp_entry - find the swap vector position in the info structure
 * @info:  info structure for the inode
 * @index: index of the page to find
 * @page:  optional page to add to the structure. Has to be preset to
 *         all zeros
 *
 * If there is no space allocated yet it will return NULL when
 * page is NULL, else it will use the page for the needed block,
 * setting it to NULL on return to indicate that it has been used.
 *
 * The swap vector is organized the following way:
 *
 * There are SHMEM_NR_DIRECT entries directly stored in the
 * shmem_inode_info structure. So small files do not need an additional
 * allocation.
 *
 * For pages with index > SHMEM_NR_DIRECT there is the pointer
 * i_indirect which points to a page which holds in the first half
 * doubly indirect blocks, in the second half triple indirect blocks:
 *
 * For an artificial ENTRIES_PER_PAGE = 4 this would lead to the
 * following layout (for SHMEM_NR_DIRECT == 16):
 *
 * i_indirect -> dir --> 16-19
 * 	      |	     +-> 20-23
 * 	      |
 * 	      +-->dir2 --> 24-27
 * 	      |	       +-> 28-31
 * 	      |	       +-> 32-35
 * 	      |	       +-> 36-39
 * 	      |
 * 	      +-->dir3 --> 40-43
 * 	 	       +-> 44-47
 * 	 	       +-> 48-51
 * 	 	       +-> 52-55
 */
static swp_entry_t *shmem_swp_entry(struct shmem_inode_info *info, unsigned long index, struct page **page)
{
	unsigned long offset;
	struct page **dir;
	struct page *subdir;

	if (index < SHMEM_NR_DIRECT) {
		shmem_swp_balance_unmap();
		return info->i_direct+index;
	}
	if (!info->i_indirect) {
		if (page) {
			info->i_indirect = *page;
			*page = NULL;
		}
		return NULL;			/* need another page */
	}

	index -= SHMEM_NR_DIRECT;
	offset = index % ENTRIES_PER_PAGE;
	index /= ENTRIES_PER_PAGE;
	dir = shmem_dir_map(info->i_indirect);

	if (index >= ENTRIES_PER_PAGE/2) {
		index -= ENTRIES_PER_PAGE/2;
		dir += ENTRIES_PER_PAGE/2 + index/ENTRIES_PER_PAGE;
		index %= ENTRIES_PER_PAGE;
		subdir = *dir;
		if (!subdir) {
			if (page) {
				*dir = *page;
				*page = NULL;
			}
			shmem_dir_unmap(dir);
			return NULL;		/* need another page */
		}
		shmem_dir_unmap(dir);
		dir = shmem_dir_map(subdir);
	}

	dir += index;
	subdir = *dir;
	if (!subdir) {
		if (!page || !(subdir = *page)) {
			shmem_dir_unmap(dir);
			return NULL;		/* need a page */
		}
		*dir = subdir;
		*page = NULL;
	}
	shmem_dir_unmap(dir);
	return shmem_swp_map(subdir) + offset;
}
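
/*
 * Reviewer's walk-through of the lookup above, using the artificial
 * layout in the comment (ENTRIES_PER_PAGE == 4, SHMEM_NR_DIRECT == 16):
 * for index 29, index becomes 29 - 16 = 13, offset = 13 % 4 = 1, and
 * index /= 4 gives 3.  Since 3 >= ENTRIES_PER_PAGE/2, the entry lives
 * in the triple-indirect half: dir advances to the "dir2" slot, and the
 * remaining index 1 selects dir2's second subdir (pages 28-31), where
 * offset 1 is page 29.
 */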

static void shmem_swp_set(struct shmem_inode_info *info, swp_entry_t *entry, unsigned long value)
{
	long incdec = value? 1: -1;

	entry->val = value;
	info->swapped += incdec;
	if ((unsigned long)(entry - info->i_direct) >= SHMEM_NR_DIRECT) {
		struct page *page = kmap_atomic_to_page(entry);
		set_page_private(page, page_private(page) + incdec);
	}
}

/**
 * shmem_swp_alloc - get the position of the swap entry for the page.
 * @info:	info structure for the inode
 * @index:	index of the page to find
 * @sgp:	check and recheck i_size? skip allocation?
 * @gfp:	gfp mask to use for any page allocation
 *
 * If the entry does not exist, allocate it.
 */
static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info,
			unsigned long index, enum sgp_type sgp, gfp_t gfp)
{
	struct inode *inode = &info->vfs_inode;
	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
	struct page *page = NULL;
	swp_entry_t *entry;

	if (sgp != SGP_WRITE &&
	    ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode))
		return ERR_PTR(-EINVAL);

	while (!(entry = shmem_swp_entry(info, index, &page))) {
		if (sgp == SGP_READ)
			return shmem_swp_map(ZERO_PAGE(0));
		/*
		 * Test used_blocks against one less than max_blocks, since
		 * we have 1 data page (and perhaps indirect index pages)
		 * yet to allocate: a waste to allocate index if we cannot
		 * allocate data.
		 */
		if (sbinfo->max_blocks) {
			if (percpu_counter_compare(&sbinfo->used_blocks,
						sbinfo->max_blocks - 1) >= 0)
				return ERR_PTR(-ENOSPC);
			percpu_counter_inc(&sbinfo->used_blocks);
			inode->i_blocks += BLOCKS_PER_PAGE;
		}

		spin_unlock(&info->lock);
		page = shmem_dir_alloc(gfp);
		spin_lock(&info->lock);

		if (!page) {
			shmem_free_blocks(inode, 1);
			return ERR_PTR(-ENOMEM);
		}
		if (sgp != SGP_WRITE &&
		    ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
			entry = ERR_PTR(-EINVAL);
			break;
		}
		if (info->next_index <= index)
			info->next_index = index + 1;
	}
	if (page) {
		/* another task gave its page, or truncated the file */
		shmem_free_blocks(inode, 1);
		shmem_dir_free(page);
	}
	if (info->next_index <= index && !IS_ERR(entry))
		info->next_index = index + 1;
	return entry;
}

/**
 * shmem_free_swp - free some swap entries in a directory
 * @dir:        pointer to the directory
 * @edir:       pointer after last entry of the directory
 * @punch_lock: pointer to spinlock when needed for the holepunch case
 */
static int shmem_free_swp(swp_entry_t *dir, swp_entry_t *edir,
						spinlock_t *punch_lock)
{
	spinlock_t *punch_unlock = NULL;
	swp_entry_t *ptr;
	int freed = 0;

	for (ptr = dir; ptr < edir; ptr++) {
		if (ptr->val) {
			if (unlikely(punch_lock)) {
				punch_unlock = punch_lock;
				punch_lock = NULL;
				spin_lock(punch_unlock);
				if (!ptr->val)
					continue;
			}
			free_swap_and_cache(*ptr);
			*ptr = (swp_entry_t){0};
			freed++;
		}
	}
	if (punch_unlock)
		spin_unlock(punch_unlock);
	return freed;
}

static int shmem_map_and_free_swp(struct page *subdir, int offset,
		int limit, struct page ***dir, spinlock_t *punch_lock)
{
	swp_entry_t *ptr;
	int freed = 0;

	ptr = shmem_swp_map(subdir);
	for (; offset < limit; offset += LATENCY_LIMIT) {
		int size = limit - offset;
		if (size > LATENCY_LIMIT)
			size = LATENCY_LIMIT;
		freed += shmem_free_swp(ptr+offset, ptr+offset+size,
							punch_lock);
		if (need_resched()) {
			shmem_swp_unmap(ptr);
			if (*dir) {
				shmem_dir_unmap(*dir);
				*dir = NULL;
			}
			cond_resched();
			ptr = shmem_swp_map(subdir);
		}
	}
	shmem_swp_unmap(ptr);
	return freed;
}

static void shmem_free_pages(struct list_head *next)
{
	struct page *page;
	int freed = 0;

	do {
		page = container_of(next, struct page, lru);
		next = next->next;
		shmem_dir_free(page);
		freed++;
		if (freed >= LATENCY_LIMIT) {
			cond_resched();
			freed = 0;
		}
	} while (next);
}

void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
{
	struct shmem_inode_info *info = SHMEM_I(inode);
	unsigned long idx;
	unsigned long size;
	unsigned long limit;
	unsigned long stage;
	unsigned long diroff;
	struct page **dir;
	struct page *topdir;
	struct page *middir;
	struct page *subdir;
	swp_entry_t *ptr;
	LIST_HEAD(pages_to_free);
	long nr_pages_to_free = 0;
	long nr_swaps_freed = 0;
	int offset;
	int freed;
	int punch_hole;
	spinlock_t *needs_lock;
	spinlock_t *punch_lock;
	unsigned long upper_limit;

	truncate_inode_pages_range(inode->i_mapping, start, end);

	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
	idx = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
	if (idx >= info->next_index)
		return;

	spin_lock(&info->lock);
	info->flags |= SHMEM_TRUNCATE;
	if (likely(end == (loff_t) -1)) {
		limit = info->next_index;
		upper_limit = SHMEM_MAX_INDEX;
		info->next_index = idx;
		needs_lock = NULL;
		punch_hole = 0;
	} else {
		if (end + 1 >= inode->i_size) {	/* we may free a little more */
			limit = (inode->i_size + PAGE_CACHE_SIZE - 1) >>
							PAGE_CACHE_SHIFT;
			upper_limit = SHMEM_MAX_INDEX;
		} else {
			limit = (end + 1) >> PAGE_CACHE_SHIFT;
			upper_limit = limit;
		}
		needs_lock = &info->lock;
		punch_hole = 1;
	}

	topdir = info->i_indirect;
	if (topdir && idx <= SHMEM_NR_DIRECT && !punch_hole) {
		info->i_indirect = NULL;
		nr_pages_to_free++;
		list_add(&topdir->lru, &pages_to_free);
	}
	spin_unlock(&info->lock);

	if (info->swapped && idx < SHMEM_NR_DIRECT) {
		ptr = info->i_direct;
		size = limit;
		if (size > SHMEM_NR_DIRECT)
			size = SHMEM_NR_DIRECT;
		nr_swaps_freed = shmem_free_swp(ptr+idx, ptr+size, needs_lock);
	}

	/*
	 * If there are no indirect blocks or we are punching a hole
	 * below indirect blocks, nothing to be done.
	 */
	if (!topdir || limit <= SHMEM_NR_DIRECT)
		goto done2;

	/*
	 * The truncation case has already dropped info->lock, and we're safe
	 * because i_size and next_index have already been lowered, preventing
	 * access beyond.  But in the punch_hole case, we still need to take
	 * the lock when updating the swap directory, because there might be
	 * racing accesses by shmem_getpage(SGP_CACHE), shmem_unuse_inode or
	 * shmem_writepage.  However, whenever we find we can remove a whole
	 * directory page (not at the misaligned start or end of the range),
	 * we first NULLify its pointer in the level above, and then have no
	 * need to take the lock when updating its contents: needs_lock and
	 * punch_lock (either pointing to info->lock or NULL) manage this.
	 */

	upper_limit -= SHMEM_NR_DIRECT;
	limit -= SHMEM_NR_DIRECT;
	idx = (idx > SHMEM_NR_DIRECT)? (idx - SHMEM_NR_DIRECT): 0;
	offset = idx % ENTRIES_PER_PAGE;
	idx -= offset;

	dir = shmem_dir_map(topdir);
	stage = ENTRIES_PER_PAGEPAGE/2;
	if (idx < ENTRIES_PER_PAGEPAGE/2) {
		middir = topdir;
		diroff = idx/ENTRIES_PER_PAGE;
	} else {
		dir += ENTRIES_PER_PAGE/2;
		dir += (idx - ENTRIES_PER_PAGEPAGE/2)/ENTRIES_PER_PAGEPAGE;
		while (stage <= idx)
			stage += ENTRIES_PER_PAGEPAGE;
		middir = *dir;
		if (*dir) {
			diroff = ((idx - ENTRIES_PER_PAGEPAGE/2) %
				ENTRIES_PER_PAGEPAGE) / ENTRIES_PER_PAGE;
			if (!diroff && !offset && upper_limit >= stage) {
				if (needs_lock) {
					spin_lock(needs_lock);
					*dir = NULL;
					spin_unlock(needs_lock);
					needs_lock = NULL;
				} else
					*dir = NULL;
				nr_pages_to_free++;
				list_add(&middir->lru, &pages_to_free);
			}
			shmem_dir_unmap(dir);
			dir = shmem_dir_map(middir);
		} else {
			diroff = 0;
			offset = 0;
			idx = stage;
		}
	}

	for (; idx < limit; idx += ENTRIES_PER_PAGE, diroff++) {
		if (unlikely(idx == stage)) {
			shmem_dir_unmap(dir);
			dir = shmem_dir_map(topdir) +
			    ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE;
			while (!*dir) {
				dir++;
				idx += ENTRIES_PER_PAGEPAGE;
				if (idx >= limit)
					goto done1;
			}
			stage = idx + ENTRIES_PER_PAGEPAGE;
			middir = *dir;
			if (punch_hole)
				needs_lock = &info->lock;
			if (upper_limit >= stage) {
				if (needs_lock) {
					spin_lock(needs_lock);
					*dir = NULL;
					spin_unlock(needs_lock);
					needs_lock = NULL;
				} else
					*dir = NULL;
				nr_pages_to_free++;
				list_add(&middir->lru, &pages_to_free);
			}
			shmem_dir_unmap(dir);
			cond_resched();
			dir = shmem_dir_map(middir);
			diroff = 0;
		}
		punch_lock = needs_lock;
		subdir = dir[diroff];
		if (subdir && !offset && upper_limit-idx >= ENTRIES_PER_PAGE) {
			if (needs_lock) {
				spin_lock(needs_lock);
				dir[diroff] = NULL;
				spin_unlock(needs_lock);
				punch_lock = NULL;
			} else
				dir[diroff] = NULL;
			nr_pages_to_free++;
			list_add(&subdir->lru, &pages_to_free);
		}
		if (subdir && page_private(subdir) /* has swap entries */) {
			size = limit - idx;
			if (size > ENTRIES_PER_PAGE)
				size = ENTRIES_PER_PAGE;
			freed = shmem_map_and_free_swp(subdir,
					offset, size, &dir, punch_lock);
			if (!dir)
				dir = shmem_dir_map(middir);
			nr_swaps_freed += freed;
			if (offset || punch_lock) {
				spin_lock(&info->lock);
				set_page_private(subdir,
					page_private(subdir) - freed);
				spin_unlock(&info->lock);
			} else
				BUG_ON(page_private(subdir) != freed);
		}
		offset = 0;
	}
done1:
	shmem_dir_unmap(dir);
done2:
	if (inode->i_mapping->nrpages && (info->flags & SHMEM_PAGEIN)) {
		/*
		 * Call truncate_inode_pages again: racing shmem_unuse_inode
		 * may have swizzled a page in from swap since
		 * truncate_pagecache or generic_delete_inode did it, before we
		 * lowered next_index.  Also, though shmem_getpage checks
		 * i_size before adding to cache, no recheck after: so fix the
		 * narrow window there too.
		 */
		truncate_inode_pages_range(inode->i_mapping, start, end);
	}

	spin_lock(&info->lock);
	info->flags &= ~SHMEM_TRUNCATE;
	info->swapped -= nr_swaps_freed;
	if (nr_pages_to_free)
		shmem_free_blocks(inode, nr_pages_to_free);
	shmem_recalc_inode(inode);
	spin_unlock(&info->lock);

	/*
	 * Empty swap vector directory pages to be freed?
	 */
	if (!list_empty(&pages_to_free)) {
		pages_to_free.prev->next = NULL;
		shmem_free_pages(pages_to_free.next);
	}
}
EXPORT_SYMBOL_GPL(shmem_truncate_range);

static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = dentry->d_inode;
	int error;

	error = inode_change_ok(inode, attr);
	if (error)
		return error;

	if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
		loff_t oldsize = inode->i_size;
		loff_t newsize = attr->ia_size;
		struct page *page = NULL;

		if (newsize < oldsize) {
			/*
			 * If truncating down to a partial page, then
			 * if that page is already allocated, hold it
			 * in memory until the truncation is over, so
			 * truncate_partial_page cannot miss it were
			 * it assigned to swap.
			 */
			if (newsize & (PAGE_CACHE_SIZE-1)) {
				(void) shmem_getpage(inode,
					newsize >> PAGE_CACHE_SHIFT,
						&page, SGP_READ, NULL);
				if (page)
					unlock_page(page);
			}
			/*
			 * Reset SHMEM_PAGEIN flag so that shmem_truncate can
			 * detect if any pages might have been added to cache
			 * after truncate_inode_pages.  But we needn't bother
			 * if it's being fully truncated to zero-length: the
			 * nrpages check is efficient enough in that case.
			 */
			if (newsize) {
				struct shmem_inode_info *info = SHMEM_I(inode);
				spin_lock(&info->lock);
				info->flags &= ~SHMEM_PAGEIN;
				spin_unlock(&info->lock);
			}
		}
		if (newsize != oldsize) {
			i_size_write(inode, newsize);
			inode->i_ctime = inode->i_mtime = CURRENT_TIME;
		}
		if (newsize < oldsize) {
			loff_t holebegin = round_up(newsize, PAGE_SIZE);
			unmap_mapping_range(inode->i_mapping, holebegin, 0, 1);
			shmem_truncate_range(inode, newsize, (loff_t)-1);
			/* unmap again to remove racily COWed private pages */
			unmap_mapping_range(inode->i_mapping, holebegin, 0, 1);
		}
		if (page)
			page_cache_release(page);
	}

	setattr_copy(inode, attr);
#ifdef CONFIG_TMPFS_POSIX_ACL
	if (attr->ia_valid & ATTR_MODE)
		error = generic_acl_chmod(inode);
#endif
	return error;
}

static void shmem_evict_inode(struct inode *inode)
{
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct shmem_xattr *xattr, *nxattr;

	if (inode->i_mapping->a_ops == &shmem_aops) {
		shmem_unacct_size(info->flags, inode->i_size);
		inode->i_size = 0;
		shmem_truncate_range(inode, 0, (loff_t)-1);
		if (!list_empty(&info->swaplist)) {
			mutex_lock(&shmem_swaplist_mutex);
			list_del_init(&info->swaplist);
			mutex_unlock(&shmem_swaplist_mutex);
		}
	}

	list_for_each_entry_safe(xattr, nxattr, &info->xattr_list, list) {
		kfree(xattr->name);
		kfree(xattr);
	}
	BUG_ON(inode->i_blocks);
	shmem_free_inode(inode->i_sb);
	end_writeback(inode);
}

static inline int shmem_find_swp(swp_entry_t entry, swp_entry_t *dir, swp_entry_t *edir)
{
	swp_entry_t *ptr;

	for (ptr = dir; ptr < edir; ptr++) {
		if (ptr->val == entry.val)
			return ptr - dir;
	}
	return -1;
}

static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, struct page *page)
{
	struct address_space *mapping;
	unsigned long idx;
	unsigned long size;
	unsigned long limit;
	unsigned long stage;
	struct page **dir;
	struct page *subdir;
	swp_entry_t *ptr;
	int offset;
	int error;

	idx = 0;
	ptr = info->i_direct;
	spin_lock(&info->lock);
	if (!info->swapped) {
		list_del_init(&info->swaplist);
		goto lost2;
	}
	limit = info->next_index;
	size = limit;
	if (size > SHMEM_NR_DIRECT)
		size = SHMEM_NR_DIRECT;
	offset = shmem_find_swp(entry, ptr, ptr+size);
	if (offset >= 0) {
		shmem_swp_balance_unmap();
		goto found;
	}
	if (!info->i_indirect)
		goto lost2;

	dir = shmem_dir_map(info->i_indirect);
	stage = SHMEM_NR_DIRECT + ENTRIES_PER_PAGEPAGE/2;

	for (idx = SHMEM_NR_DIRECT; idx < limit; idx += ENTRIES_PER_PAGE, dir++) {
		if (unlikely(idx == stage)) {
			shmem_dir_unmap(dir-1);
			if (cond_resched_lock(&info->lock)) {
				/* check it has not been truncated */
				if (limit > info->next_index) {
					limit = info->next_index;
					if (idx >= limit)
						goto lost2;
				}
			}
			dir = shmem_dir_map(info->i_indirect) +
			    ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE;
			while (!*dir) {
				dir++;
				idx += ENTRIES_PER_PAGEPAGE;
				if (idx >= limit)
					goto lost1;
			}
			stage = idx + ENTRIES_PER_PAGEPAGE;
			subdir = *dir;
			shmem_dir_unmap(dir);
			dir = shmem_dir_map(subdir);
		}
		subdir = *dir;
		if (subdir && page_private(subdir)) {
			ptr = shmem_swp_map(subdir);
			size = limit - idx;
			if (size > ENTRIES_PER_PAGE)
				size = ENTRIES_PER_PAGE;
			offset = shmem_find_swp(entry, ptr, ptr+size);
			shmem_swp_unmap(ptr);
			if (offset >= 0) {
				shmem_dir_unmap(dir);
				ptr = shmem_swp_map(subdir);
				goto found;
			}
		}
	}
lost1:
	shmem_dir_unmap(dir-1);
lost2:
	spin_unlock(&info->lock);
	return 0;
found:
	idx += offset;
	ptr += offset;

	/*
	 * Move _head_ to start search for next from here.
	 * But be careful: shmem_evict_inode checks list_empty without taking
	 * mutex, and there's an instant in list_move_tail when info->swaplist
	 * would appear empty, if it were the only one on shmem_swaplist.  We
	 * could avoid doing it if inode NULL; or use this minor optimization.
	 */
	if (shmem_swaplist.next != &info->swaplist)
		list_move_tail(&shmem_swaplist, &info->swaplist);

	/*
	 * We rely on shmem_swaplist_mutex, not only to protect the swaplist,
	 * but also to hold up shmem_evict_inode(): so inode cannot be freed
	 * beneath us (pagelock doesn't help until the page is in pagecache).
	 */
	mapping = info->vfs_inode.i_mapping;
	error = add_to_page_cache_locked(page, mapping, idx, GFP_NOWAIT);
	/* which does mem_cgroup_uncharge_cache_page on error */

	if (error != -ENOMEM) {
		delete_from_swap_cache(page);
		set_page_dirty(page);
		info->flags |= SHMEM_PAGEIN;
		shmem_swp_set(info, ptr, 0);
		swap_free(entry);
		error = 1;	/* not an error, but entry was found */
	}
	shmem_swp_unmap(ptr);
	spin_unlock(&info->lock);
	return error;
}

/*
 * shmem_unuse() searches for a possibly swapped-out shmem page.
 */
int shmem_unuse(swp_entry_t entry, struct page *page)
{
	struct list_head *p, *next;
	struct shmem_inode_info *info;
	int found = 0;
	int error;

	/*
	 * Charge page using GFP_KERNEL while we can wait, before taking
	 * the shmem_swaplist_mutex which might hold up shmem_writepage().
	 * Charged back to the user (not to caller) when swap account is used.
	 * add_to_page_cache() will be called with GFP_NOWAIT.
	 */
	error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL);
	if (error)
		goto out;
	/*
	 * Try to preload while we can wait, to not make a habit of
	 * draining atomic reserves; but don't latch on to this cpu,
	 * it's okay if sometimes we get rescheduled after this.
	 */
	error = radix_tree_preload(GFP_KERNEL);
	if (error)
		goto uncharge;
	radix_tree_preload_end();

	mutex_lock(&shmem_swaplist_mutex);
	list_for_each_safe(p, next, &shmem_swaplist) {
		info = list_entry(p, struct shmem_inode_info, swaplist);
		found = shmem_unuse_inode(info, entry, page);
		cond_resched();
		if (found)
			break;
	}
	mutex_unlock(&shmem_swaplist_mutex);

uncharge:
	if (!found)
		mem_cgroup_uncharge_cache_page(page);
	if (found < 0)
		error = found;
out:
	unlock_page(page);
	page_cache_release(page);
	return error;
}

/*
 * Move the page from the page cache to the swap cache.
 */
static int shmem_writepage(struct page *page, struct writeback_control *wbc)
{
	struct shmem_inode_info *info;
	swp_entry_t *entry, swap;
	struct address_space *mapping;
	unsigned long index;
	struct inode *inode;

	BUG_ON(!PageLocked(page));
	mapping = page->mapping;
	index = page->index;
	inode = mapping->host;
	info = SHMEM_I(inode);
	if (info->flags & VM_LOCKED)
		goto redirty;
	if (!total_swap_pages)
		goto redirty;

	/*
	 * shmem_backing_dev_info's capabilities prevent regular writeback or
	 * sync from ever calling shmem_writepage; but a stacking filesystem
	 * might use ->writepage of its underlying filesystem, in which case
	 * tmpfs should write out to swap only in response to memory pressure,
	 * and not for the writeback threads or sync.
	 */
	if (!wbc->for_reclaim) {
		WARN_ON_ONCE(1);	/* Still happens? Tell us about it! */
		goto redirty;
	}
	swap = get_swap_page();
	if (!swap.val)
		goto redirty;

	/*
	 * Add inode to shmem_unuse()'s list of swapped-out inodes,
	 * if it's not already there.  Do it now because we cannot take
	 * mutex while holding spinlock, and must do so before the page
	 * is moved to swap cache, when its pagelock no longer protects
	 * the inode from eviction.  But don't unlock the mutex until
	 * we've taken the spinlock, because shmem_unuse_inode() will
	 * prune a !swapped inode from the swaplist under both locks.
	 */
	mutex_lock(&shmem_swaplist_mutex);
	if (list_empty(&info->swaplist))
		list_add_tail(&info->swaplist, &shmem_swaplist);

	spin_lock(&info->lock);
	mutex_unlock(&shmem_swaplist_mutex);

	if (index >= info->next_index) {
		BUG_ON(!(info->flags & SHMEM_TRUNCATE));
		goto unlock;
	}
	entry = shmem_swp_entry(info, index, NULL);
	if (entry->val) {
		WARN_ON_ONCE(1);	/* Still happens? Tell us about it! */
		free_swap_and_cache(*entry);
		shmem_swp_set(info, entry, 0);
	}
	shmem_recalc_inode(inode);

	if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
		delete_from_page_cache(page);
		shmem_swp_set(info, entry, swap.val);
		shmem_swp_unmap(entry);
		swap_shmem_alloc(swap);
		spin_unlock(&info->lock);
		BUG_ON(page_mapped(page));
		swap_writepage(page, wbc);
		return 0;
	}

	shmem_swp_unmap(entry);
unlock:
	spin_unlock(&info->lock);
	/*
	 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
	 * clear SWAP_HAS_CACHE flag.
	 */
	swapcache_free(swap, NULL);
redirty:
	set_page_dirty(page);
	if (wbc->for_reclaim)
		return AOP_WRITEPAGE_ACTIVATE;	/* Return with page locked */
	unlock_page(page);
	return 0;
}

#ifdef CONFIG_NUMA
#ifdef CONFIG_TMPFS
static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
{
	char buffer[64];

	if (!mpol || mpol->mode == MPOL_DEFAULT)
		return;		/* show nothing */

	mpol_to_str(buffer, sizeof(buffer), mpol, 1);

	seq_printf(seq, ",mpol=%s", buffer);
}

static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
{
	struct mempolicy *mpol = NULL;
	if (sbinfo->mpol) {
		spin_lock(&sbinfo->stat_lock);	/* prevent replace/use races */
		mpol = sbinfo->mpol;
		mpol_get(mpol);
		spin_unlock(&sbinfo->stat_lock);
	}
	return mpol;
}
#endif /* CONFIG_TMPFS */

static struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp,
			struct shmem_inode_info *info, unsigned long idx)
{
	struct mempolicy mpol, *spol;
	struct vm_area_struct pvma;
	struct page *page;

	spol = mpol_cond_copy(&mpol,
				mpol_shared_policy_lookup(&info->policy, idx));

	/* Create a pseudo vma that just contains the policy */
	pvma.vm_start = 0;
	pvma.vm_pgoff = idx;
	pvma.vm_ops = NULL;
	pvma.vm_policy = spol;
	page = swapin_readahead(entry, gfp, &pvma, 0);
	return page;
}

static struct page *shmem_alloc_page(gfp_t gfp,
			struct shmem_inode_info *info, unsigned long idx)
{
	struct vm_area_struct pvma;

	/* Create a pseudo vma that just contains the policy */
	pvma.vm_start = 0;
	pvma.vm_pgoff = idx;
	pvma.vm_ops = NULL;
	pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx);

	/*
	 * alloc_page_vma() will drop the shared policy reference
	 */
	return alloc_page_vma(gfp, &pvma, 0);
}
#else /* !CONFIG_NUMA */
#ifdef CONFIG_TMPFS
static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *p)
{
}
#endif /* CONFIG_TMPFS */

static inline struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp,
			struct shmem_inode_info *info, unsigned long idx)
{
	return swapin_readahead(entry, gfp, NULL, 0);
}

static inline struct page *shmem_alloc_page(gfp_t gfp,
			struct shmem_inode_info *info, unsigned long idx)
{
	return alloc_page(gfp);
}
#endif /* CONFIG_NUMA */

#if !defined(CONFIG_NUMA) || !defined(CONFIG_TMPFS)
static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
{
	return NULL;
}
#endif

/*
 * shmem_getpage_gfp - find page in cache, or get from swap, or allocate
 *
 * If we allocate a new one we do not mark it dirty.  That's up to the
 * vm.  If we swap it in we mark it dirty, and we also free the swap
 * entry, since a page cannot live in both the swap cache and page cache.
 */
static int shmem_getpage_gfp(struct inode *inode, pgoff_t idx,
	struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type)
{
	struct address_space *mapping = inode->i_mapping;
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct shmem_sb_info *sbinfo;
	struct page *page;
	struct page *prealloc_page = NULL;
	swp_entry_t *entry;
	swp_entry_t swap;
	int error;
	int ret;

	if (idx >= SHMEM_MAX_INDEX)
		return -EFBIG;
repeat:
	page = find_lock_page(mapping, idx);
	if (page) {
		/*
		 * Once we can get the page lock, it must be uptodate:
		 * if there were an error in reading back from swap,
		 * the page would not be inserted into the filecache.
		 */
		BUG_ON(!PageUptodate(page));
		goto done;
	}

	/*
	 * Try to preload while we can wait, to not make a habit of
	 * draining atomic reserves; but don't latch on to this cpu.
	 */
	error = radix_tree_preload(gfp & GFP_RECLAIM_MASK);
	if (error)
		goto out;
	radix_tree_preload_end();

	if (sgp != SGP_READ && !prealloc_page) {
		prealloc_page = shmem_alloc_page(gfp, info, idx);
		if (prealloc_page) {
			SetPageSwapBacked(prealloc_page);
			if (mem_cgroup_cache_charge(prealloc_page,
					current->mm, GFP_KERNEL)) {
				page_cache_release(prealloc_page);
				prealloc_page = NULL;
			}
		}
	}

	spin_lock(&info->lock);
	shmem_recalc_inode(inode);
	entry = shmem_swp_alloc(info, idx, sgp, gfp);
	if (IS_ERR(entry)) {
		spin_unlock(&info->lock);
		error = PTR_ERR(entry);
		goto out;
	}
	swap = *entry;

	if (swap.val) {
		/* Look it up and read it in.. */
		page = lookup_swap_cache(swap);
		if (!page) {
			shmem_swp_unmap(entry);
			spin_unlock(&info->lock);
			/* here we actually do the io */
			if (fault_type)
				*fault_type |= VM_FAULT_MAJOR;
			page = shmem_swapin(swap, gfp, info, idx);
			if (!page) {
				spin_lock(&info->lock);
				entry = shmem_swp_alloc(info, idx, sgp, gfp);
				if (IS_ERR(entry))
					error = PTR_ERR(entry);
				else {
					if (entry->val == swap.val)
						error = -ENOMEM;
					shmem_swp_unmap(entry);
				}
				spin_unlock(&info->lock);
				if (error)
					goto out;
				goto repeat;
			}
			wait_on_page_locked(page);
			page_cache_release(page);
			goto repeat;
		}

		/* We have to do this with page locked to prevent races */
		if (!trylock_page(page)) {
			shmem_swp_unmap(entry);
			spin_unlock(&info->lock);
			wait_on_page_locked(page);
			page_cache_release(page);
			goto repeat;
		}
		if (PageWriteback(page)) {
			shmem_swp_unmap(entry);
			spin_unlock(&info->lock);
			wait_on_page_writeback(page);
			unlock_page(page);
			page_cache_release(page);
			goto repeat;
		}
		if (!PageUptodate(page)) {
			shmem_swp_unmap(entry);
			spin_unlock(&info->lock);
			unlock_page(page);
			page_cache_release(page);
			error = -EIO;
			goto out;
		}

		error = add_to_page_cache_locked(page, mapping,
						 idx, GFP_NOWAIT);
		if (error) {
			shmem_swp_unmap(entry);
			spin_unlock(&info->lock);
			if (error == -ENOMEM) {
				/*
				 * reclaim from proper memory cgroup and
				 * call memcg's OOM if needed.
				 */
				error = mem_cgroup_shmem_charge_fallback(
						page, current->mm, gfp);
				if (error) {
					unlock_page(page);
					page_cache_release(page);
					goto out;
				}
			}
			unlock_page(page);
			page_cache_release(page);
			goto repeat;
		}

		info->flags |= SHMEM_PAGEIN;
		shmem_swp_set(info, entry, 0);
		shmem_swp_unmap(entry);
		delete_from_swap_cache(page);
		spin_unlock(&info->lock);
		set_page_dirty(page);
		swap_free(swap);

	} else if (sgp == SGP_READ) {
		shmem_swp_unmap(entry);
		page = find_get_page(mapping, idx);
		if (page && !trylock_page(page)) {
			spin_unlock(&info->lock);
			wait_on_page_locked(page);
			page_cache_release(page);
			goto repeat;
		}
		spin_unlock(&info->lock);

	} else if (prealloc_page) {
		shmem_swp_unmap(entry);
		sbinfo = SHMEM_SB(inode->i_sb);
		if (sbinfo->max_blocks) {
			if (percpu_counter_compare(&sbinfo->used_blocks,
						sbinfo->max_blocks) >= 0 ||
			    shmem_acct_block(info->flags))
				goto nospace;
			percpu_counter_inc(&sbinfo->used_blocks);
			inode->i_blocks += BLOCKS_PER_PAGE;
		} else if (shmem_acct_block(info->flags))
			goto nospace;

		page = prealloc_page;
		prealloc_page = NULL;

		entry = shmem_swp_alloc(info, idx, sgp, gfp);
		if (IS_ERR(entry))
			error = PTR_ERR(entry);
		else {
			swap = *entry;
			shmem_swp_unmap(entry);
		}
		ret = error || swap.val;
		if (ret)
			mem_cgroup_uncharge_cache_page(page);
		else
			ret = add_to_page_cache_lru(page, mapping,
						idx, GFP_NOWAIT);
		/*
		 * At add_to_page_cache_lru() failure,
		 * uncharge will be done automatically.
		 */
		if (ret) {
			shmem_unacct_blocks(info->flags, 1);
			shmem_free_blocks(inode, 1);
			spin_unlock(&info->lock);
			page_cache_release(page);
			if (error)
				goto out;
			goto repeat;
		}

		info->flags |= SHMEM_PAGEIN;
		info->alloced++;
		spin_unlock(&info->lock);
		clear_highpage(page);
		flush_dcache_page(page);
		SetPageUptodate(page);
		if (sgp == SGP_DIRTY)
			set_page_dirty(page);

	} else {
		spin_unlock(&info->lock);
		error = -ENOMEM;
		goto out;
	}
done:
	*pagep = page;
	error = 0;
out:
	if (prealloc_page) {
		mem_cgroup_uncharge_cache_page(prealloc_page);
		page_cache_release(prealloc_page);
	}
	return error;

nospace:
	/*
	 * Perhaps the page was brought in from swap between find_lock_page
	 * and taking info->lock?  We allow for that at add_to_page_cache_lru,
	 * but must also avoid reporting a spurious ENOSPC while working on a
	 * full tmpfs.
	 */
	page = find_get_page(mapping, idx);
	spin_unlock(&info->lock);
	if (page) {
		page_cache_release(page);
		goto repeat;
	}
	error = -ENOSPC;
	goto out;
}

static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
	int error;
	int ret = VM_FAULT_LOCKED;

	if (((loff_t)vmf->pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
		return VM_FAULT_SIGBUS;

	error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret);
	if (error)
		return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);

	if (ret & VM_FAULT_MAJOR) {
		count_vm_event(PGMAJFAULT);
		mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
	}
	return ret;
}

#ifdef CONFIG_NUMA
static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
{
	struct inode *i = vma->vm_file->f_path.dentry->d_inode;
	return mpol_set_shared_policy(&SHMEM_I(i)->policy, vma, new);
}

static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
					  unsigned long addr)
{
	struct inode *i = vma->vm_file->f_path.dentry->d_inode;
	unsigned long idx;

	idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
	return mpol_shared_policy_lookup(&SHMEM_I(i)->policy, idx);
}
#endif

int shmem_lock(struct file *file, int lock, struct user_struct *user)
{
	struct inode *inode = file->f_path.dentry->d_inode;
	struct shmem_inode_info *info = SHMEM_I(inode);
	int retval = -ENOMEM;

	spin_lock(&info->lock);
	if (lock && !(info->flags & VM_LOCKED)) {
		if (!user_shm_lock(inode->i_size, user))
			goto out_nomem;
		info->flags |= VM_LOCKED;
		mapping_set_unevictable(file->f_mapping);
	}
	if (!lock && (info->flags & VM_LOCKED) && user) {
		user_shm_unlock(inode->i_size, user);
		info->flags &= ~VM_LOCKED;
		mapping_clear_unevictable(file->f_mapping);
		scan_mapping_unevictable_pages(file->f_mapping);
	}
	retval = 0;

out_nomem:
	spin_unlock(&info->lock);
	return retval;
}

static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
{
	file_accessed(file);
	vma->vm_ops = &shmem_vm_ops;
	vma->vm_flags |= VM_CAN_NONLINEAR;
	return 0;
}

static struct inode *shmem_get_inode(struct super_block *sb, const struct inode *dir,
				     int mode, dev_t dev, unsigned long flags)
{
	struct inode *inode;
	struct shmem_inode_info *info;
	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);

	if (shmem_reserve_inode(sb))
		return NULL;

	inode = new_inode(sb);
	if (inode) {
		inode->i_ino = get_next_ino();
		inode_init_owner(inode, dir, mode);
		inode->i_blocks = 0;
		inode->i_mapping->backing_dev_info = &shmem_backing_dev_info;
		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
		inode->i_generation = get_seconds();
		info = SHMEM_I(inode);
		memset(info, 0, (char *)inode - (char *)info);
		spin_lock_init(&info->lock);
		info->flags = flags & VM_NORESERVE;
		INIT_LIST_HEAD(&info->swaplist);
		INIT_LIST_HEAD(&info->xattr_list);
		cache_no_acl(inode);

		switch (mode & S_IFMT) {
		default:
			inode->i_op = &shmem_special_inode_operations;
			init_special_inode(inode, mode, dev);
			break;
		case S_IFREG:
			inode->i_mapping->a_ops = &shmem_aops;
			inode->i_op = &shmem_inode_operations;
			inode->i_fop = &shmem_file_operations;
			mpol_shared_policy_init(&info->policy,
						 shmem_get_sbmpol(sbinfo));
			break;
		case S_IFDIR:
			inc_nlink(inode);
			/* Some things misbehave if size == 0 on a directory */
			inode->i_size = 2 * BOGO_DIRENT_SIZE;
			inode->i_op = &shmem_dir_inode_operations;
			inode->i_fop = &simple_dir_operations;
			break;
		case S_IFLNK:
			/*
			 * Must not load anything in the rbtree,
			 * mpol_free_shared_policy will not be called.
			 */
			mpol_shared_policy_init(&info->policy, NULL);
			break;
		}
	} else
		shmem_free_inode(sb);
	return inode;
}

#ifdef CONFIG_TMPFS
static const struct inode_operations shmem_symlink_inode_operations;
static const struct inode_operations shmem_symlink_inline_operations;

static int
shmem_write_begin(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned flags,
			struct page **pagep, void **fsdata)
{
	struct inode *inode = mapping->host;
	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
	return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL);
}

static int
shmem_write_end(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned copied,
			struct page *page, void *fsdata)
{
	struct inode *inode = mapping->host;

	if (pos + copied > inode->i_size)
		i_size_write(inode, pos + copied);

	set_page_dirty(page);
	unlock_page(page);
	page_cache_release(page);

	return copied;
}

static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc, read_actor_t actor)
{
	struct inode *inode = filp->f_path.dentry->d_inode;
	struct address_space *mapping = inode->i_mapping;
	unsigned long index, offset;
	enum sgp_type sgp = SGP_READ;

	/*
	 * Might this read be for a stacking filesystem?  Then when reading
	 * holes of a sparse file, we actually need to allocate those pages,
	 * and even mark them dirty, so it cannot exceed the max_blocks limit.
	 */
	if (segment_eq(get_fs(), KERNEL_DS))
		sgp = SGP_DIRTY;

	index = *ppos >> PAGE_CACHE_SHIFT;
	offset = *ppos & ~PAGE_CACHE_MASK;

	for (;;) {
		struct page *page = NULL;
		unsigned long end_index, nr, ret;
		loff_t i_size = i_size_read(inode);

		end_index = i_size >> PAGE_CACHE_SHIFT;
		if (index > end_index)
			break;
		if (index == end_index) {
			nr = i_size & ~PAGE_CACHE_MASK;
			if (nr <= offset)
				break;
		}

		desc->error = shmem_getpage(inode, index, &page, sgp, NULL);
		if (desc->error) {
			if (desc->error == -EINVAL)
				desc->error = 0;
			break;
		}
		if (page)
			unlock_page(page);

		/*
		 * We must evaluate after, since reads (unlike writes)
		 * are called without i_mutex protection against truncate
		 */
		nr = PAGE_CACHE_SIZE;
		i_size = i_size_read(inode);
		end_index = i_size >> PAGE_CACHE_SHIFT;
		if (index == end_index) {
			nr = i_size & ~PAGE_CACHE_MASK;
			if (nr <= offset) {
				if (page)
					page_cache_release(page);
				break;
			}
		}
		nr -= offset;

		if (page) {
			/*
			 * If users can be writing to this page using arbitrary
			 * virtual addresses, take care about potential aliasing
			 * before reading the page on the kernel side.
			 */
			if (mapping_writably_mapped(mapping))
				flush_dcache_page(page);
			/*
			 * Mark the page accessed if we read the beginning.
			 */
			if (!offset)
				mark_page_accessed(page);
		} else {
			page = ZERO_PAGE(0);
			page_cache_get(page);
		}

		/*
		 * Ok, we have the page, and it's up-to-date, so
		 * now we can copy it to user space...
		 *
		 * The actor routine returns how many bytes were actually used..
		 * NOTE! This may not be the same as how much of a user buffer
		 * we filled up (we may be padding etc), so we can only update
		 * "pos" here (the actor routine has to update the user buffer
		 * pointers and the remaining count).
		 */
		ret = actor(desc, page, offset, nr);
		offset += ret;
		index += offset >> PAGE_CACHE_SHIFT;
		offset &= ~PAGE_CACHE_MASK;

		page_cache_release(page);
		if (ret != nr || !desc->count)
			break;

		cond_resched();
	}

	*ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
	file_accessed(filp);
}

static ssize_t shmem_file_aio_read(struct kiocb *iocb,
		const struct iovec *iov, unsigned long nr_segs, loff_t pos)
{
	struct file *filp = iocb->ki_filp;
	ssize_t retval;
	unsigned long seg;
	size_t count;
	loff_t *ppos = &iocb->ki_pos;

	retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
	if (retval)
		return retval;

	for (seg = 0; seg < nr_segs; seg++) {
		read_descriptor_t desc;

		desc.written = 0;
		desc.arg.buf = iov[seg].iov_base;
		desc.count = iov[seg].iov_len;
		if (desc.count == 0)
			continue;
		desc.error = 0;
		do_shmem_file_read(filp, ppos, &desc, file_read_actor);
		retval += desc.written;
		if (desc.error) {
			retval = retval ?: desc.error;
			break;
		}
		if (desc.count > 0)
			break;
	}
	return retval;
}

static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
				struct pipe_inode_info *pipe, size_t len,
				unsigned int flags)
{
	struct address_space *mapping = in->f_mapping;
	struct inode *inode = mapping->host;
	unsigned int loff, nr_pages, req_pages;
	struct page *pages[PIPE_DEF_BUFFERS];
	struct partial_page partial[PIPE_DEF_BUFFERS];
	struct page *page;
	pgoff_t index, end_index;
	loff_t isize, left;
	int error, page_nr;
	struct splice_pipe_desc spd = {
		.pages = pages,
		.partial = partial,
		.flags = flags,
		.ops = &page_cache_pipe_buf_ops,
		.spd_release = spd_release_page,
	};

	isize = i_size_read(inode);
	if (unlikely(*ppos >= isize))
		return 0;

	left = isize - *ppos;
	if (unlikely(left < len))
		len = left;

	if (splice_grow_spd(pipe, &spd))
		return -ENOMEM;

	index = *ppos >> PAGE_CACHE_SHIFT;
	loff = *ppos & ~PAGE_CACHE_MASK;
	req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
	nr_pages = min(req_pages, pipe->buffers);

	spd.nr_pages = find_get_pages_contig(mapping, index,
						nr_pages, spd.pages);
	index += spd.nr_pages;
	error = 0;

	while (spd.nr_pages < nr_pages) {
		error = shmem_getpage(inode, index, &page, SGP_CACHE, NULL);
		if (error)
			break;
		unlock_page(page);
		spd.pages[spd.nr_pages++] = page;
		index++;
	}

	index = *ppos >> PAGE_CACHE_SHIFT;
	nr_pages = spd.nr_pages;
	spd.nr_pages = 0;

	for (page_nr = 0; page_nr < nr_pages; page_nr++) {
		unsigned int this_len;

		if (!len)
			break;

		this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
		page = spd.pages[page_nr];

		if (!PageUptodate(page) || page->mapping != mapping) {
			error = shmem_getpage(inode, index, &page,
							SGP_CACHE, NULL);
			if (error)
				break;
			unlock_page(page);
			page_cache_release(spd.pages[page_nr]);
			spd.pages[page_nr] = page;
		}

		isize = i_size_read(inode);
		end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
		if (unlikely(!isize || index > end_index))
			break;

		if (end_index == index) {
			unsigned int plen;

			plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
			if (plen <= loff)
				break;

			this_len = min(this_len, plen - loff);
			len = this_len;
		}

		spd.partial[page_nr].offset = loff;
		spd.partial[page_nr].len = this_len;
		len -= this_len;
		loff = 0;
		spd.nr_pages++;
		index++;
	}

	while (page_nr < nr_pages)
		page_cache_release(spd.pages[page_nr++]);

	if (spd.nr_pages)
		error = splice_to_pipe(pipe, &spd);

	splice_shrink_spd(pipe, &spd);

	if (error > 0) {
		*ppos += error;
		file_accessed(in);
	}
	return error;
}

static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
{
	struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);

	buf->f_type = TMPFS_MAGIC;
	buf->f_bsize = PAGE_CACHE_SIZE;
	buf->f_namelen = NAME_MAX;
	if (sbinfo->max_blocks) {
		buf->f_blocks = sbinfo->max_blocks;
		buf->f_bavail = buf->f_bfree =
			sbinfo->max_blocks - percpu_counter_sum(&sbinfo->used_blocks);
	}
	if (sbinfo->max_inodes) {
		buf->f_files = sbinfo->max_inodes;
		buf->f_ffree = sbinfo->free_inodes;
	}
	/* else leave those fields 0 like simple_statfs */
	return 0;
}

/*
 * File creation. Allocate an inode, and we're done..
 */
static int
shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
{
	struct inode *inode;
	int error = -ENOSPC;

	inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE);
	if (inode) {
		error = security_inode_init_security(inode, dir,
						     &dentry->d_name, NULL,
						     NULL, NULL);
		if (error) {
			if (error != -EOPNOTSUPP) {
				iput(inode);
				return error;
			}
		}
#ifdef CONFIG_TMPFS_POSIX_ACL
		error = generic_acl_init(inode, dir);
		if (error) {
			iput(inode);
			return error;
		}
#else
		error = 0;
#endif
		dir->i_size += BOGO_DIRENT_SIZE;
		dir->i_ctime = dir->i_mtime = CURRENT_TIME;
		d_instantiate(dentry, inode);
		dget(dentry); /* Extra count - pin the dentry in core */
	}
	return error;
}

static int shmem_mkdir(struct inode *dir, struct dentry *dentry, int mode)
{
	int error;

	if ((error = shmem_mknod(dir, dentry, mode | S_IFDIR, 0)))
		return error;
	inc_nlink(dir);
	return 0;
}

static int shmem_create(struct inode *dir, struct dentry *dentry, int mode,
		struct nameidata *nd)
{
	return shmem_mknod(dir, dentry, mode | S_IFREG, 0);
}

/*
 * Link a file..
 */
static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
{
	struct inode *inode = old_dentry->d_inode;
	int ret;

	/*
	 * No ordinary (disk based) filesystem counts links as inodes;
	 * but each new link needs a new dentry, pinning lowmem, and
	 * tmpfs dentries cannot be pruned until they are unlinked.
	 */
	ret = shmem_reserve_inode(inode->i_sb);
	if (ret)
		goto out;

	dir->i_size += BOGO_DIRENT_SIZE;
	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
	inc_nlink(inode);
	ihold(inode);	/* New dentry reference */
	dget(dentry);	/* Extra pinning count for the created dentry */
	d_instantiate(dentry, inode);
out:
	return ret;
}

static int shmem_unlink(struct inode *dir, struct dentry *dentry)
{
	struct inode *inode = dentry->d_inode;

	if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode))
		shmem_free_inode(inode->i_sb);

	dir->i_size -= BOGO_DIRENT_SIZE;
	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
	drop_nlink(inode);
	dput(dentry);	/* Undo the count from "create" - this does all the work */
	return 0;
}

static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
{
	if (!simple_empty(dentry))
		return -ENOTEMPTY;

	drop_nlink(dentry->d_inode);
	drop_nlink(dir);
	return shmem_unlink(dir, dentry);
}
1959 */ 1960 ret = shmem_reserve_inode(inode->i_sb); 1961 if (ret) 1962 goto out; 1963 1964 dir->i_size += BOGO_DIRENT_SIZE; 1965 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; 1966 inc_nlink(inode); 1967 ihold(inode); /* New dentry reference */ 1968 dget(dentry); /* Extra pinning count for the created dentry */ 1969 d_instantiate(dentry, inode); 1970out: 1971 return ret; 1972} 1973 1974static int shmem_unlink(struct inode *dir, struct dentry *dentry) 1975{ 1976 struct inode *inode = dentry->d_inode; 1977 1978 if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode)) 1979 shmem_free_inode(inode->i_sb); 1980 1981 dir->i_size -= BOGO_DIRENT_SIZE; 1982 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; 1983 drop_nlink(inode); 1984 dput(dentry); /* Undo the count from "create" - this does all the work */ 1985 return 0; 1986} 1987 1988static int shmem_rmdir(struct inode *dir, struct dentry *dentry) 1989{ 1990 if (!simple_empty(dentry)) 1991 return -ENOTEMPTY; 1992 1993 drop_nlink(dentry->d_inode); 1994 drop_nlink(dir); 1995 return shmem_unlink(dir, dentry); 1996} 1997 1998/* 1999 * The VFS layer already does all the dentry stuff for rename; 2000 * we just have to decrement the usage count for the target if 2001 * it exists, so that the VFS layer correctly frees it when it 2002 * gets overwritten. 2003 */ 2004static int shmem_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) 2005{ 2006 struct inode *inode = old_dentry->d_inode; 2007 int they_are_dirs = S_ISDIR(inode->i_mode); 2008 2009 if (!simple_empty(new_dentry)) 2010 return -ENOTEMPTY; 2011 2012 if (new_dentry->d_inode) { 2013 (void) shmem_unlink(new_dir, new_dentry); 2014 if (they_are_dirs) 2015 drop_nlink(old_dir); 2016 } else if (they_are_dirs) { 2017 drop_nlink(old_dir); 2018 inc_nlink(new_dir); 2019 } 2020 2021 old_dir->i_size -= BOGO_DIRENT_SIZE; 2022 new_dir->i_size += BOGO_DIRENT_SIZE; 2023 old_dir->i_ctime = old_dir->i_mtime = 2024 new_dir->i_ctime = new_dir->i_mtime = 2025 inode->i_ctime = CURRENT_TIME; 2026 return 0; 2027} 2028 2029static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *symname) 2030{ 2031 int error; 2032 int len; 2033 struct inode *inode; 2034 struct page *page; 2035 char *kaddr; 2036 struct shmem_inode_info *info; 2037 2038 len = strlen(symname) + 1; 2039 if (len > PAGE_CACHE_SIZE) 2040 return -ENAMETOOLONG; 2041 2042 inode = shmem_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0, VM_NORESERVE); 2043 if (!inode) 2044 return -ENOSPC; 2045 2046 error = security_inode_init_security(inode, dir, &dentry->d_name, NULL, 2047 NULL, NULL); 2048 if (error) { 2049 if (error != -EOPNOTSUPP) { 2050 iput(inode); 2051 return error; 2052 } 2053 error = 0; 2054 } 2055 2056 info = SHMEM_I(inode); 2057 inode->i_size = len-1; 2058 if (len <= SHMEM_SYMLINK_INLINE_LEN) { 2059 /* do it inline */ 2060 memcpy(info->inline_symlink, symname, len); 2061 inode->i_op = &shmem_symlink_inline_operations; 2062 } else { 2063 error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL); 2064 if (error) { 2065 iput(inode); 2066 return error; 2067 } 2068 inode->i_mapping->a_ops = &shmem_aops; 2069 inode->i_op = &shmem_symlink_inode_operations; 2070 kaddr = kmap_atomic(page, KM_USER0); 2071 memcpy(kaddr, symname, len); 2072 kunmap_atomic(kaddr, KM_USER0); 2073 set_page_dirty(page); 2074 unlock_page(page); 2075 page_cache_release(page); 2076 } 2077 dir->i_size += BOGO_DIRENT_SIZE; 2078 dir->i_ctime = dir->i_mtime = CURRENT_TIME; 2079 d_instantiate(dentry, 
inode); 2080 dget(dentry); 2081 return 0; 2082} 2083 2084static void *shmem_follow_link_inline(struct dentry *dentry, struct nameidata *nd) 2085{ 2086 nd_set_link(nd, SHMEM_I(dentry->d_inode)->inline_symlink); 2087 return NULL; 2088} 2089 2090static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd) 2091{ 2092 struct page *page = NULL; 2093 int res = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL); 2094 nd_set_link(nd, res ? ERR_PTR(res) : kmap(page)); 2095 if (page) 2096 unlock_page(page); 2097 return page; 2098} 2099 2100static void shmem_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) 2101{ 2102 if (!IS_ERR(nd_get_link(nd))) { 2103 struct page *page = cookie; 2104 kunmap(page); 2105 mark_page_accessed(page); 2106 page_cache_release(page); 2107 } 2108} 2109 2110#ifdef CONFIG_TMPFS_XATTR 2111/* 2112 * Superblocks without xattr inode operations may get some security.* xattr 2113 * support from the LSM "for free". As soon as we have any other xattrs 2114 * like ACLs, we also need to implement the security.* handlers at 2115 * filesystem level, though. 2116 */ 2117 2118static int shmem_xattr_get(struct dentry *dentry, const char *name, 2119 void *buffer, size_t size) 2120{ 2121 struct shmem_inode_info *info; 2122 struct shmem_xattr *xattr; 2123 int ret = -ENODATA; 2124 2125 info = SHMEM_I(dentry->d_inode); 2126 2127 spin_lock(&info->lock); 2128 list_for_each_entry(xattr, &info->xattr_list, list) { 2129 if (strcmp(name, xattr->name)) 2130 continue; 2131 2132 ret = xattr->size; 2133 if (buffer) { 2134 if (size < xattr->size) 2135 ret = -ERANGE; 2136 else 2137 memcpy(buffer, xattr->value, xattr->size); 2138 } 2139 break; 2140 } 2141 spin_unlock(&info->lock); 2142 return ret; 2143} 2144 2145static int shmem_xattr_set(struct dentry *dentry, const char *name, 2146 const void *value, size_t size, int flags) 2147{ 2148 struct inode *inode = dentry->d_inode; 2149 struct shmem_inode_info *info = SHMEM_I(inode); 2150 struct shmem_xattr *xattr; 2151 struct shmem_xattr *new_xattr = NULL; 2152 size_t len; 2153 int err = 0; 2154 2155 /* value == NULL means remove */ 2156 if (value) { 2157 /* wrap around? 
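Yes, the size_t sum below can: any wrapped value of
	 * sizeof(*new_xattr) + size comes out strictly smaller than
	 * sizeof(*new_xattr) itself (illustratively, 40 + (SIZE_MAX - 8)
	 * wraps to 31), so the comparison rejects it; a legal empty
	 * value (size == 0) yields exactly sizeof(*new_xattr) and
	 * passes.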
*/ 2158 len = sizeof(*new_xattr) + size; 2159 if (len < sizeof(*new_xattr)) 2160 return -ENOMEM; 2161 2162 new_xattr = kmalloc(len, GFP_KERNEL); 2163 if (!new_xattr) 2164 return -ENOMEM; 2165 2166 new_xattr->name = kstrdup(name, GFP_KERNEL); 2167 if (!new_xattr->name) { 2168 kfree(new_xattr); 2169 return -ENOMEM; 2170 } 2171 2172 new_xattr->size = size; 2173 memcpy(new_xattr->value, value, size); 2174 } 2175 2176 spin_lock(&info->lock); 2177 list_for_each_entry(xattr, &info->xattr_list, list) { 2178 if (!strcmp(name, xattr->name)) { 2179 if (flags & XATTR_CREATE) { 2180 xattr = new_xattr; 2181 err = -EEXIST; 2182 } else if (new_xattr) { 2183 list_replace(&xattr->list, &new_xattr->list); 2184 } else { 2185 list_del(&xattr->list); 2186 } 2187 goto out; 2188 } 2189 } 2190 if (flags & XATTR_REPLACE) { 2191 xattr = new_xattr; 2192 err = -ENODATA; 2193 } else { /* removal always passes XATTR_REPLACE, so new_xattr cannot be NULL here */ 2194 list_add(&new_xattr->list, &info->xattr_list); 2195 xattr = NULL; 2196 } 2197out: 2198 spin_unlock(&info->lock); 2199 if (xattr) 2200 kfree(xattr->name); 2201 kfree(xattr); 2202 return err; 2203} 2204 2205 2206static const struct xattr_handler *shmem_xattr_handlers[] = { 2207#ifdef CONFIG_TMPFS_POSIX_ACL 2208 &generic_acl_access_handler, 2209 &generic_acl_default_handler, 2210#endif 2211 NULL 2212}; 2213 2214static int shmem_xattr_validate(const char *name) 2215{ 2216 struct { const char *prefix; size_t len; } arr[] = { 2217 { XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN }, 2218 { XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN } 2219 }; 2220 int i; 2221 2222 for (i = 0; i < ARRAY_SIZE(arr); i++) { 2223 size_t preflen = arr[i].len; 2224 if (strncmp(name, arr[i].prefix, preflen) == 0) { 2225 if (!name[preflen]) 2226 return -EINVAL; 2227 return 0; 2228 } 2229 } 2230 return -EOPNOTSUPP; 2231} 2232 2233static ssize_t shmem_getxattr(struct dentry *dentry, const char *name, 2234 void *buffer, size_t size) 2235{ 2236 int err; 2237 2238 /* 2239 * If this is a request for a synthetic attribute in the system.* 2240 * namespace, use the generic infrastructure to resolve a handler 2241 * for it via sb->s_xattr. 2242 */ 2243 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) 2244 return generic_getxattr(dentry, name, buffer, size); 2245 2246 err = shmem_xattr_validate(name); 2247 if (err) 2248 return err; 2249 2250 return shmem_xattr_get(dentry, name, buffer, size); 2251} 2252 2253static int shmem_setxattr(struct dentry *dentry, const char *name, 2254 const void *value, size_t size, int flags) 2255{ 2256 int err; 2257 2258 /* 2259 * If this is a request for a synthetic attribute in the system.* 2260 * namespace, use the generic infrastructure to resolve a handler 2261 * for it via sb->s_xattr. 2262 */ 2263 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) 2264 return generic_setxattr(dentry, name, value, size, flags); 2265 2266 err = shmem_xattr_validate(name); 2267 if (err) 2268 return err; 2269 2270 if (size == 0) 2271 value = ""; /* empty EA, do not remove */ 2272 2273 return shmem_xattr_set(dentry, name, value, size, flags); 2274 2275} 2276 2277static int shmem_removexattr(struct dentry *dentry, const char *name) 2278{ 2279 int err; 2280 2281 /* 2282 * If this is a request for a synthetic attribute in the system.* 2283 * namespace, use the generic infrastructure to resolve a handler 2284 * for it via sb->s_xattr. 
2285 */ 2286 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) 2287 return generic_removexattr(dentry, name); 2288 2289 err = shmem_xattr_validate(name); 2290 if (err) 2291 return err; 2292 2293 return shmem_xattr_set(dentry, name, NULL, 0, XATTR_REPLACE); 2294} 2295 2296static bool xattr_is_trusted(const char *name) 2297{ 2298 return !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN); 2299} 2300 2301static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size) 2302{ 2303 bool trusted = capable(CAP_SYS_ADMIN); 2304 struct shmem_xattr *xattr; 2305 struct shmem_inode_info *info; 2306 size_t used = 0; 2307 2308 info = SHMEM_I(dentry->d_inode); 2309 2310 spin_lock(&info->lock); 2311 list_for_each_entry(xattr, &info->xattr_list, list) { 2312 size_t len; 2313 2314 /* skip "trusted." attributes for unprivileged callers */ 2315 if (!trusted && xattr_is_trusted(xattr->name)) 2316 continue; 2317 2318 len = strlen(xattr->name) + 1; 2319 used += len; 2320 if (buffer) { 2321 if (size < used) { 2322 used = -ERANGE; 2323 break; 2324 } 2325 memcpy(buffer, xattr->name, len); 2326 buffer += len; 2327 } 2328 } 2329 spin_unlock(&info->lock); 2330 2331 return used; 2332} 2333#endif /* CONFIG_TMPFS_XATTR */ 2334 2335static const struct inode_operations shmem_symlink_inline_operations = { 2336 .readlink = generic_readlink, 2337 .follow_link = shmem_follow_link_inline, 2338#ifdef CONFIG_TMPFS_XATTR 2339 .setxattr = shmem_setxattr, 2340 .getxattr = shmem_getxattr, 2341 .listxattr = shmem_listxattr, 2342 .removexattr = shmem_removexattr, 2343#endif 2344}; 2345 2346static const struct inode_operations shmem_symlink_inode_operations = { 2347 .readlink = generic_readlink, 2348 .follow_link = shmem_follow_link, 2349 .put_link = shmem_put_link, 2350#ifdef CONFIG_TMPFS_XATTR 2351 .setxattr = shmem_setxattr, 2352 .getxattr = shmem_getxattr, 2353 .listxattr = shmem_listxattr, 2354 .removexattr = shmem_removexattr, 2355#endif 2356}; 2357 2358static struct dentry *shmem_get_parent(struct dentry *child) 2359{ 2360 return ERR_PTR(-ESTALE); 2361} 2362 2363static int shmem_match(struct inode *ino, void *vfh) 2364{ 2365 __u32 *fh = vfh; 2366 __u64 inum = fh[2]; 2367 inum = (inum << 32) | fh[1]; 2368 return ino->i_ino == inum && fh[0] == ino->i_generation; 2369} 2370 2371static struct dentry *shmem_fh_to_dentry(struct super_block *sb, 2372 struct fid *fid, int fh_len, int fh_type) 2373{ 2374 struct inode *inode; 2375 struct dentry *dentry = NULL; 2376 u64 inum = fid->raw[2]; 2377 inum = (inum << 32) | fid->raw[1]; 2378 2379 if (fh_len < 3) 2380 return NULL; 2381 2382 inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]), 2383 shmem_match, fid->raw); 2384 if (inode) { 2385 dentry = d_find_alias(inode); 2386 iput(inode); 2387 } 2388 2389 return dentry; 2390} 2391 2392static int shmem_encode_fh(struct dentry *dentry, __u32 *fh, int *len, 2393 int connectable) 2394{ 2395 struct inode *inode = dentry->d_inode; 2396 2397 if (*len < 3) { 2398 *len = 3; 2399 return 255; 2400 } 2401 2402 if (inode_unhashed(inode)) { 2403 /* Unfortunately insert_inode_hash is not idempotent, 2404 * so as we hash inodes here rather than at creation 2405 * time, we need a lock to ensure we only try 2406 * to do it once 2407 */ 2408 static DEFINE_SPINLOCK(lock); 2409 spin_lock(&lock); 2410 if (inode_unhashed(inode)) 2411 __insert_inode_hash(inode, 2412 inode->i_ino + inode->i_generation); 2413 spin_unlock(&lock); 2414 } 2415 2416 fh[0] = inode->i_generation; 2417 fh[1] = inode->i_ino; 2418 fh[2] = 
((__u64)inode->i_ino) >> 32; 2419 2420 *len = 3; 2421 return 1; 2422} 2423 2424static const struct export_operations shmem_export_ops = { 2425 .get_parent = shmem_get_parent, 2426 .encode_fh = shmem_encode_fh, 2427 .fh_to_dentry = shmem_fh_to_dentry, 2428}; 2429 2430static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo, 2431 bool remount) 2432{ 2433 char *this_char, *value, *rest; 2434 2435 while (options != NULL) { 2436 this_char = options; 2437 for (;;) { 2438 /* 2439 * NUL-terminate this option: unfortunately, 2440 * mount options form a comma-separated list, 2441 * but mpol's nodelist may also contain commas. 2442 */ 2443 options = strchr(options, ','); 2444 if (options == NULL) 2445 break; 2446 options++; 2447 if (!isdigit(*options)) { 2448 options[-1] = '\0'; 2449 break; 2450 } 2451 } 2452 if (!*this_char) 2453 continue; 2454 if ((value = strchr(this_char,'=')) != NULL) { 2455 *value++ = 0; 2456 } else { 2457 printk(KERN_ERR 2458 "tmpfs: No value for mount option '%s'\n", 2459 this_char); 2460 return 1; 2461 } 2462 2463 if (!strcmp(this_char,"size")) { 2464 unsigned long long size; 2465 size = memparse(value,&rest); 2466 if (*rest == '%') { 2467 size <<= PAGE_SHIFT; 2468 size *= totalram_pages; 2469 do_div(size, 100); 2470 rest++; 2471 } 2472 if (*rest) 2473 goto bad_val; 2474 sbinfo->max_blocks = 2475 DIV_ROUND_UP(size, PAGE_CACHE_SIZE); 2476 } else if (!strcmp(this_char,"nr_blocks")) { 2477 sbinfo->max_blocks = memparse(value, &rest); 2478 if (*rest) 2479 goto bad_val; 2480 } else if (!strcmp(this_char,"nr_inodes")) { 2481 sbinfo->max_inodes = memparse(value, &rest); 2482 if (*rest) 2483 goto bad_val; 2484 } else if (!strcmp(this_char,"mode")) { 2485 if (remount) 2486 continue; 2487 sbinfo->mode = simple_strtoul(value, &rest, 8) & 07777; 2488 if (*rest) 2489 goto bad_val; 2490 } else if (!strcmp(this_char,"uid")) { 2491 if (remount) 2492 continue; 2493 sbinfo->uid = simple_strtoul(value, &rest, 0); 2494 if (*rest) 2495 goto bad_val; 2496 } else if (!strcmp(this_char,"gid")) { 2497 if (remount) 2498 continue; 2499 sbinfo->gid = simple_strtoul(value, &rest, 0); 2500 if (*rest) 2501 goto bad_val; 2502 } else if (!strcmp(this_char,"mpol")) { 2503 if (mpol_parse_str(value, &sbinfo->mpol, 1)) 2504 goto bad_val; 2505 } else { 2506 printk(KERN_ERR "tmpfs: Bad mount option %s\n", 2507 this_char); 2508 return 1; 2509 } 2510 } 2511 return 0; 2512 2513bad_val: 2514 printk(KERN_ERR "tmpfs: Bad value '%s' for mount option '%s'\n", 2515 value, this_char); 2516 return 1; 2517 2518} 2519 2520static int shmem_remount_fs(struct super_block *sb, int *flags, char *data) 2521{ 2522 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 2523 struct shmem_sb_info config = *sbinfo; 2524 unsigned long inodes; 2525 int error = -EINVAL; 2526 2527 if (shmem_parse_options(data, &config, true)) 2528 return error; 2529 2530 spin_lock(&sbinfo->stat_lock); 2531 inodes = sbinfo->max_inodes - sbinfo->free_inodes; 2532 if (percpu_counter_compare(&sbinfo->used_blocks, config.max_blocks) > 0) 2533 goto out; 2534 if (config.max_inodes < inodes) 2535 goto out; 2536 /* 2537 * Those tests also disallow limited->unlimited while any are in 2538 * use, so i_blocks will always be zero when max_blocks is zero; 2539 * but we must separately disallow unlimited->limited, because 2540 * in that case we have no record of how much is already in use. 
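 *
 * For example (illustrative): remounting with nr_blocks below the
 * number of blocks currently in use, or remounting a size-unlimited
 * instance with any size limit at all, fails here with -EINVAL.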
2541 */ 2542 if (config.max_blocks && !sbinfo->max_blocks) 2543 goto out; 2544 if (config.max_inodes && !sbinfo->max_inodes) 2545 goto out; 2546 2547 error = 0; 2548 sbinfo->max_blocks = config.max_blocks; 2549 sbinfo->max_inodes = config.max_inodes; 2550 sbinfo->free_inodes = config.max_inodes - inodes; 2551 2552 mpol_put(sbinfo->mpol); 2553 sbinfo->mpol = config.mpol; /* transfers initial ref */ 2554out: 2555 spin_unlock(&sbinfo->stat_lock); 2556 return error; 2557} 2558 2559static int shmem_show_options(struct seq_file *seq, struct vfsmount *vfs) 2560{ 2561 struct shmem_sb_info *sbinfo = SHMEM_SB(vfs->mnt_sb); 2562 2563 if (sbinfo->max_blocks != shmem_default_max_blocks()) 2564 seq_printf(seq, ",size=%luk", 2565 sbinfo->max_blocks << (PAGE_CACHE_SHIFT - 10)); 2566 if (sbinfo->max_inodes != shmem_default_max_inodes()) 2567 seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes); 2568 if (sbinfo->mode != (S_IRWXUGO | S_ISVTX)) 2569 seq_printf(seq, ",mode=%03o", sbinfo->mode); 2570 if (sbinfo->uid != 0) 2571 seq_printf(seq, ",uid=%u", sbinfo->uid); 2572 if (sbinfo->gid != 0) 2573 seq_printf(seq, ",gid=%u", sbinfo->gid); 2574 shmem_show_mpol(seq, sbinfo->mpol); 2575 return 0; 2576} 2577#endif /* CONFIG_TMPFS */ 2578 2579static void shmem_put_super(struct super_block *sb) 2580{ 2581 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 2582 2583 percpu_counter_destroy(&sbinfo->used_blocks); 2584 kfree(sbinfo); 2585 sb->s_fs_info = NULL; 2586} 2587 2588int shmem_fill_super(struct super_block *sb, void *data, int silent) 2589{ 2590 struct inode *inode; 2591 struct dentry *root; 2592 struct shmem_sb_info *sbinfo; 2593 int err = -ENOMEM; 2594 2595 /* Round up to L1_CACHE_BYTES to resist false sharing */ 2596 sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info), 2597 L1_CACHE_BYTES), GFP_KERNEL); 2598 if (!sbinfo) 2599 return -ENOMEM; 2600 2601 sbinfo->mode = S_IRWXUGO | S_ISVTX; 2602 sbinfo->uid = current_fsuid(); 2603 sbinfo->gid = current_fsgid(); 2604 sb->s_fs_info = sbinfo; 2605 2606#ifdef CONFIG_TMPFS 2607 /* 2608 * By default we allow only half of the physical RAM per 2609 * tmpfs instance, limiting inodes to one per page of lowmem; 2610 * but the internal instance is left unlimited. 
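 *
 * So a plain "mount -t tmpfs tmpfs /mnt" gets the defaults above,
 * while (illustratively) "-o size=1g,nr_inodes=4096" overrides both:
 * size takes memparse suffixes (k/m/g) or a percentage of RAM, as
 * in size=50%, handled by shmem_parse_options() above.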
2611 */ 2612 if (!(sb->s_flags & MS_NOUSER)) { 2613 sbinfo->max_blocks = shmem_default_max_blocks(); 2614 sbinfo->max_inodes = shmem_default_max_inodes(); 2615 if (shmem_parse_options(data, sbinfo, false)) { 2616 err = -EINVAL; 2617 goto failed; 2618 } 2619 } 2620 sb->s_export_op = &shmem_export_ops; 2621#else 2622 sb->s_flags |= MS_NOUSER; 2623#endif 2624 2625 spin_lock_init(&sbinfo->stat_lock); 2626 if (percpu_counter_init(&sbinfo->used_blocks, 0)) 2627 goto failed; 2628 sbinfo->free_inodes = sbinfo->max_inodes; 2629 2630 sb->s_maxbytes = SHMEM_MAX_BYTES; 2631 sb->s_blocksize = PAGE_CACHE_SIZE; 2632 sb->s_blocksize_bits = PAGE_CACHE_SHIFT; 2633 sb->s_magic = TMPFS_MAGIC; 2634 sb->s_op = &shmem_ops; 2635 sb->s_time_gran = 1; 2636#ifdef CONFIG_TMPFS_XATTR 2637 sb->s_xattr = shmem_xattr_handlers; 2638#endif 2639#ifdef CONFIG_TMPFS_POSIX_ACL 2640 sb->s_flags |= MS_POSIXACL; 2641#endif 2642 2643 inode = shmem_get_inode(sb, NULL, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE); 2644 if (!inode) 2645 goto failed; 2646 inode->i_uid = sbinfo->uid; 2647 inode->i_gid = sbinfo->gid; 2648 root = d_alloc_root(inode); 2649 if (!root) 2650 goto failed_iput; 2651 sb->s_root = root; 2652 return 0; 2653 2654failed_iput: 2655 iput(inode); 2656failed: 2657 shmem_put_super(sb); 2658 return err; 2659} 2660 2661static struct kmem_cache *shmem_inode_cachep; 2662 2663static struct inode *shmem_alloc_inode(struct super_block *sb) 2664{ 2665 struct shmem_inode_info *p; 2666 p = (struct shmem_inode_info *)kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL); 2667 if (!p) 2668 return NULL; 2669 return &p->vfs_inode; 2670} 2671 2672static void shmem_i_callback(struct rcu_head *head) 2673{ 2674 struct inode *inode = container_of(head, struct inode, i_rcu); 2675 INIT_LIST_HEAD(&inode->i_dentry); 2676 kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); 2677} 2678 2679static void shmem_destroy_inode(struct inode *inode) 2680{ 2681 if ((inode->i_mode & S_IFMT) == S_IFREG) { 2682 /* only struct inode is valid if it's an inline symlink */ 2683 mpol_free_shared_policy(&SHMEM_I(inode)->policy); 2684 } 2685 call_rcu(&inode->i_rcu, shmem_i_callback); 2686} 2687 2688static void init_once(void *foo) 2689{ 2690 struct shmem_inode_info *p = (struct shmem_inode_info *) foo; 2691 2692 inode_init_once(&p->vfs_inode); 2693} 2694 2695static int init_inodecache(void) 2696{ 2697 shmem_inode_cachep = kmem_cache_create("shmem_inode_cache", 2698 sizeof(struct shmem_inode_info), 2699 0, SLAB_PANIC, init_once); 2700 return 0; 2701} 2702 2703static void destroy_inodecache(void) 2704{ 2705 kmem_cache_destroy(shmem_inode_cachep); 2706} 2707 2708static const struct address_space_operations shmem_aops = { 2709 .writepage = shmem_writepage, 2710 .set_page_dirty = __set_page_dirty_no_writeback, 2711#ifdef CONFIG_TMPFS 2712 .write_begin = shmem_write_begin, 2713 .write_end = shmem_write_end, 2714#endif 2715 .migratepage = migrate_page, 2716 .error_remove_page = generic_error_remove_page, 2717}; 2718 2719static const struct file_operations shmem_file_operations = { 2720 .mmap = shmem_mmap, 2721#ifdef CONFIG_TMPFS 2722 .llseek = generic_file_llseek, 2723 .read = do_sync_read, 2724 .write = do_sync_write, 2725 .aio_read = shmem_file_aio_read, 2726 .aio_write = generic_file_aio_write, 2727 .fsync = noop_fsync, 2728 .splice_read = shmem_file_splice_read, 2729 .splice_write = generic_file_splice_write, 2730#endif 2731}; 2732 2733static const struct inode_operations shmem_inode_operations = { 2734 .setattr = shmem_setattr, 2735 .truncate_range = shmem_truncate_range, 
2736#ifdef CONFIG_TMPFS_XATTR 2737 .setxattr = shmem_setxattr, 2738 .getxattr = shmem_getxattr, 2739 .listxattr = shmem_listxattr, 2740 .removexattr = shmem_removexattr, 2741#endif 2742#ifdef CONFIG_TMPFS_POSIX_ACL 2743 .check_acl = generic_check_acl, 2744#endif 2745 2746}; 2747 2748static const struct inode_operations shmem_dir_inode_operations = { 2749#ifdef CONFIG_TMPFS 2750 .create = shmem_create, 2751 .lookup = simple_lookup, 2752 .link = shmem_link, 2753 .unlink = shmem_unlink, 2754 .symlink = shmem_symlink, 2755 .mkdir = shmem_mkdir, 2756 .rmdir = shmem_rmdir, 2757 .mknod = shmem_mknod, 2758 .rename = shmem_rename, 2759#endif 2760#ifdef CONFIG_TMPFS_XATTR 2761 .setxattr = shmem_setxattr, 2762 .getxattr = shmem_getxattr, 2763 .listxattr = shmem_listxattr, 2764 .removexattr = shmem_removexattr, 2765#endif 2766#ifdef CONFIG_TMPFS_POSIX_ACL 2767 .setattr = shmem_setattr, 2768 .check_acl = generic_check_acl, 2769#endif 2770}; 2771 2772static const struct inode_operations shmem_special_inode_operations = { 2773#ifdef CONFIG_TMPFS_XATTR 2774 .setxattr = shmem_setxattr, 2775 .getxattr = shmem_getxattr, 2776 .listxattr = shmem_listxattr, 2777 .removexattr = shmem_removexattr, 2778#endif 2779#ifdef CONFIG_TMPFS_POSIX_ACL 2780 .setattr = shmem_setattr, 2781 .check_acl = generic_check_acl, 2782#endif 2783}; 2784 2785static const struct super_operations shmem_ops = { 2786 .alloc_inode = shmem_alloc_inode, 2787 .destroy_inode = shmem_destroy_inode, 2788#ifdef CONFIG_TMPFS 2789 .statfs = shmem_statfs, 2790 .remount_fs = shmem_remount_fs, 2791 .show_options = shmem_show_options, 2792#endif 2793 .evict_inode = shmem_evict_inode, 2794 .drop_inode = generic_delete_inode, 2795 .put_super = shmem_put_super, 2796}; 2797 2798static const struct vm_operations_struct shmem_vm_ops = { 2799 .fault = shmem_fault, 2800#ifdef CONFIG_NUMA 2801 .set_policy = shmem_set_policy, 2802 .get_policy = shmem_get_policy, 2803#endif 2804}; 2805 2806 2807static struct dentry *shmem_mount(struct file_system_type *fs_type, 2808 int flags, const char *dev_name, void *data) 2809{ 2810 return mount_nodev(fs_type, flags, data, shmem_fill_super); 2811} 2812 2813static struct file_system_type tmpfs_fs_type = { 2814 .owner = THIS_MODULE, 2815 .name = "tmpfs", 2816 .mount = shmem_mount, 2817 .kill_sb = kill_litter_super, 2818}; 2819 2820int __init init_tmpfs(void) 2821{ 2822 int error; 2823 2824 error = bdi_init(&shmem_backing_dev_info); 2825 if (error) 2826 goto out4; 2827 2828 error = init_inodecache(); 2829 if (error) 2830 goto out3; 2831 2832 error = register_filesystem(&tmpfs_fs_type); 2833 if (error) { 2834 printk(KERN_ERR "Could not register tmpfs\n"); 2835 goto out2; 2836 } 2837 2838 shm_mnt = vfs_kern_mount(&tmpfs_fs_type, MS_NOUSER, 2839 tmpfs_fs_type.name, NULL); 2840 if (IS_ERR(shm_mnt)) { 2841 error = PTR_ERR(shm_mnt); 2842 printk(KERN_ERR "Could not kern_mount tmpfs\n"); 2843 goto out1; 2844 } 2845 return 0; 2846 2847out1: 2848 unregister_filesystem(&tmpfs_fs_type); 2849out2: 2850 destroy_inodecache(); 2851out3: 2852 bdi_destroy(&shmem_backing_dev_info); 2853out4: 2854 shm_mnt = ERR_PTR(error); 2855 return error; 2856} 2857 2858#ifdef CONFIG_CGROUP_MEM_RES_CTLR 2859/** 2860 * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file 2861 * @inode: the inode to be searched 2862 * @pgoff: the offset to be searched 2863 * @pagep: the pointer for the found page to be stored 2864 * @ent: the pointer for the found swap entry to be stored 2865 * 2866 * If a page is found, refcount of it is incremented. 
Callers must handle 2867 * this refcount. 2868 */ 2869void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff, 2870 struct page **pagep, swp_entry_t *ent) 2871{ 2872 swp_entry_t entry = { .val = 0 }, *ptr; 2873 struct page *page = NULL; 2874 struct shmem_inode_info *info = SHMEM_I(inode); 2875 2876 if ((pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode)) 2877 goto out; 2878 2879 spin_lock(&info->lock); 2880 ptr = shmem_swp_entry(info, pgoff, NULL); 2881#ifdef CONFIG_SWAP 2882 if (ptr && ptr->val) { 2883 entry.val = ptr->val; 2884 page = find_get_page(&swapper_space, entry.val); 2885 } else 2886#endif 2887 page = find_get_page(inode->i_mapping, pgoff); 2888 if (ptr) 2889 shmem_swp_unmap(ptr); 2890 spin_unlock(&info->lock); 2891out: 2892 *pagep = page; 2893 *ent = entry; 2894} 2895#endif 2896 2897#else /* !CONFIG_SHMEM */ 2898 2899/* 2900 * tiny-shmem: simple shmemfs and tmpfs using ramfs code 2901 * 2902 * This is intended for small systems where the benefits of the full 2903 * shmem code (swap-backed and resource-limited) are outweighed by 2904 * their complexity. On systems without swap this code should be 2905 * effectively equivalent, but much lighter weight. 2906 */ 2907 2908#include <linux/ramfs.h> 2909 2910static struct file_system_type tmpfs_fs_type = { 2911 .name = "tmpfs", 2912 .mount = ramfs_mount, 2913 .kill_sb = kill_litter_super, 2914}; 2915 2916int __init init_tmpfs(void) 2917{ 2918 BUG_ON(register_filesystem(&tmpfs_fs_type) != 0); 2919 2920 shm_mnt = kern_mount(&tmpfs_fs_type); 2921 BUG_ON(IS_ERR(shm_mnt)); 2922 2923 return 0; 2924} 2925 2926int shmem_unuse(swp_entry_t entry, struct page *page) 2927{ 2928 return 0; 2929} 2930 2931int shmem_lock(struct file *file, int lock, struct user_struct *user) 2932{ 2933 return 0; 2934} 2935 2936void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end) 2937{ 2938 truncate_inode_pages_range(inode->i_mapping, start, end); 2939} 2940EXPORT_SYMBOL_GPL(shmem_truncate_range); 2941 2942#ifdef CONFIG_CGROUP_MEM_RES_CTLR 2943/** 2944 * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file 2945 * @inode: the inode to be searched 2946 * @pgoff: the offset to be searched 2947 * @pagep: the pointer for the found page to be stored 2948 * @ent: the pointer for the found swap entry to be stored 2949 * 2950 * If a page is found, refcount of it is incremented. Callers must handle 2951 * this refcount. 
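 *
 * An illustrative caller pattern (hypothetical, not from this file):
 *
 *	mem_cgroup_get_shmem_target(inode, pgoff, &page, &ent);
 *	if (page)
 *		...use the page, then page_cache_release(page)...
 *	else if (ent.val)
 *		...the offset lives in swap, keyed by ent...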
2952 */ 2953void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff, 2954 struct page **pagep, swp_entry_t *ent) 2955{ 2956 struct page *page = NULL; 2957 2958 if ((pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode)) 2959 goto out; 2960 page = find_get_page(inode->i_mapping, pgoff); 2961out: 2962 *pagep = page; 2963 *ent = (swp_entry_t){ .val = 0 }; 2964} 2965#endif 2966 2967#define shmem_vm_ops generic_file_vm_ops 2968#define shmem_file_operations ramfs_file_operations 2969#define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev) 2970#define shmem_acct_size(flags, size) 0 2971#define shmem_unacct_size(flags, size) do {} while (0) 2972#define SHMEM_MAX_BYTES MAX_LFS_FILESIZE 2973 2974#endif /* CONFIG_SHMEM */ 2975 2976/* common code */ 2977 2978/** 2979 * shmem_file_setup - get an unlinked file living in tmpfs 2980 * @name: name for dentry (to be seen in /proc/<pid>/maps) 2981 * @size: size to be set for the file 2982 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size 2983 */ 2984struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags) 2985{ 2986 int error; 2987 struct file *file; 2988 struct inode *inode; 2989 struct path path; 2990 struct dentry *root; 2991 struct qstr this; 2992 2993 if (IS_ERR(shm_mnt)) 2994 return (void *)shm_mnt; 2995 2996 if (size < 0 || size > SHMEM_MAX_BYTES) 2997 return ERR_PTR(-EINVAL); 2998 2999 if (shmem_acct_size(flags, size)) 3000 return ERR_PTR(-ENOMEM); 3001 3002 error = -ENOMEM; 3003 this.name = name; 3004 this.len = strlen(name); 3005 this.hash = 0; /* will go */ 3006 root = shm_mnt->mnt_root; 3007 path.dentry = d_alloc(root, &this); 3008 if (!path.dentry) 3009 goto put_memory; 3010 path.mnt = mntget(shm_mnt); 3011 3012 error = -ENOSPC; 3013 inode = shmem_get_inode(root->d_sb, NULL, S_IFREG | S_IRWXUGO, 0, flags); 3014 if (!inode) 3015 goto put_dentry; 3016 3017 d_instantiate(path.dentry, inode); 3018 inode->i_size = size; 3019 inode->i_nlink = 0; /* It is unlinked */ 3020#ifndef CONFIG_MMU 3021 error = ramfs_nommu_expand_for_mapping(inode, size); 3022 if (error) 3023 goto put_dentry; 3024#endif 3025 3026 error = -ENFILE; 3027 file = alloc_file(&path, FMODE_WRITE | FMODE_READ, 3028 &shmem_file_operations); 3029 if (!file) 3030 goto put_dentry; 3031 3032 return file; 3033 3034put_dentry: 3035 path_put(&path); 3036put_memory: 3037 shmem_unacct_size(flags, size); 3038 return ERR_PTR(error); 3039} 3040EXPORT_SYMBOL_GPL(shmem_file_setup); 3041 3042/** 3043 * shmem_zero_setup - setup a shared anonymous mapping 3044 * @vma: the vma to be mmapped, as prepared by do_mmap_pgoff 3045 */ 3046int shmem_zero_setup(struct vm_area_struct *vma) 3047{ 3048 struct file *file; 3049 loff_t size = vma->vm_end - vma->vm_start; 3050 3051 file = shmem_file_setup("dev/zero", size, vma->vm_flags); 3052 if (IS_ERR(file)) 3053 return PTR_ERR(file); 3054 3055 if (vma->vm_file) 3056 fput(vma->vm_file); 3057 vma->vm_file = file; 3058 vma->vm_ops = &shmem_vm_ops; 3059 vma->vm_flags |= VM_CAN_NONLINEAR; 3060 return 0; 3061} 3062 3063/** 3064 * shmem_read_mapping_page_gfp - read into page cache, using specified page allocation flags. 3065 * @mapping: the page's address_space 3066 * @index: the page index 3067 * @gfp: the page allocator flags to use if allocating 3068 * 3069 * This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)", 3070 * with any new page allocations done using the specified allocation flags. 
3071 * But read_cache_page_gfp() uses the ->readpage() method: which does not 3072 * suit tmpfs, since it may have pages in swapcache, and needs to find those 3073 * for itself; although drivers/gpu/drm i915 and ttm rely upon this support. 3074 * 3075 * i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in 3076 * with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily. 3077 */ 3078struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, 3079 pgoff_t index, gfp_t gfp) 3080{ 3081#ifdef CONFIG_SHMEM 3082 struct inode *inode = mapping->host; 3083 struct page *page; 3084 int error; 3085 3086 BUG_ON(mapping->a_ops != &shmem_aops); 3087 error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE, gfp, NULL); 3088 if (error) 3089 page = ERR_PTR(error); 3090 else 3091 unlock_page(page); 3092 return page; 3093#else 3094 /* 3095 * The tiny !SHMEM case uses ramfs without swap 3096 */ 3097 return read_cache_page_gfp(mapping, index, gfp); 3098#endif 3099} 3100EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp); 3101
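
/*
 * Illustrative sketch (not part of shmem.c): how a caller such as a GPU
 * driver might use shmem_read_mapping_page_gfp(), mixing relaxed reclaim
 * flags into the mapping's gfp mask as the i915 note above describes.
 * The function name and its caller are hypothetical; the block is
 * wrapped in #if 0 so it is never built.
 */
#if 0
static struct page *example_get_object_page(struct address_space *mapping,
					    pgoff_t index)
{
	/* Fail fast and quietly rather than retrying into OOM */
	gfp_t gfp = mapping_gfp_mask(mapping) | __GFP_NORETRY | __GFP_NOWARN;

	/*
	 * On success the page comes back unlocked, with a reference
	 * held; the caller must page_cache_release() it when done.
	 */
	return shmem_read_mapping_page_gfp(mapping, index, gfp);
}
#endif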