shmem.c revision 285b2c4fdd69ea73b4762785d8c6be83b6c074a6
1/*
2 * Resizable virtual memory filesystem for Linux.
3 *
4 * Copyright (C) 2000 Linus Torvalds.
5 *		 2000 Transmeta Corp.
6 *		 2000-2001 Christoph Rohland
7 *		 2000-2001 SAP AG
8 *		 2002 Red Hat Inc.
9 * Copyright (C) 2002-2005 Hugh Dickins.
10 * Copyright (C) 2002-2005 VERITAS Software Corporation.
11 * Copyright (C) 2004 Andi Kleen, SuSE Labs
12 *
13 * Extended attribute support for tmpfs:
14 * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net>
15 * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
16 *
17 * tiny-shmem:
18 * Copyright (c) 2004, 2008 Matt Mackall <mpm@selenic.com>
19 *
20 * This file is released under the GPL.
21 */
22
23#include <linux/fs.h>
24#include <linux/init.h>
25#include <linux/vfs.h>
26#include <linux/mount.h>
27#include <linux/pagemap.h>
28#include <linux/file.h>
29#include <linux/mm.h>
30#include <linux/module.h>
31#include <linux/percpu_counter.h>
32#include <linux/swap.h>
33
34static struct vfsmount *shm_mnt;
35
36#ifdef CONFIG_SHMEM
37/*
38 * This virtual memory filesystem is heavily based on the ramfs. It
39 * extends ramfs by the ability to use swap and honor resource limits
40 * which makes it a completely usable filesystem.
41 */
42
43#include <linux/xattr.h>
44#include <linux/exportfs.h>
45#include <linux/posix_acl.h>
46#include <linux/generic_acl.h>
47#include <linux/mman.h>
48#include <linux/string.h>
49#include <linux/slab.h>
50#include <linux/backing-dev.h>
51#include <linux/shmem_fs.h>
52#include <linux/writeback.h>
53#include <linux/blkdev.h>
54#include <linux/splice.h>
55#include <linux/security.h>
56#include <linux/swapops.h>
57#include <linux/mempolicy.h>
58#include <linux/namei.h>
59#include <linux/ctype.h>
60#include <linux/migrate.h>
61#include <linux/highmem.h>
62#include <linux/seq_file.h>
63#include <linux/magic.h>
64
65#include <asm/uaccess.h>
66#include <asm/div64.h>
67#include <asm/pgtable.h>
68
69#define BLOCKS_PER_PAGE  (PAGE_CACHE_SIZE/512)
70#define VM_ACCT(size)    (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT)
71
72/* Pretend that each entry is of this size in a directory's i_size */
73#define BOGO_DIRENT_SIZE 20
74
75struct shmem_xattr {
76	struct list_head list;	/* anchored by shmem_inode_info->xattr_list */
77	char *name;		/* xattr name */
78	size_t size;
79	char value[0];
80};
81
82/* Flag allocation requirements to shmem_getpage */
83enum sgp_type {
84	SGP_READ,	/* don't exceed i_size, don't allocate page */
85	SGP_CACHE,	/* don't exceed i_size, may allocate page */
86	SGP_DIRTY,	/* like SGP_CACHE, but set new page dirty */
87	SGP_WRITE,	/* may exceed i_size, may allocate page */
88};
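
/*
 * For orientation, the callers in this file use these types as follows:
 * shmem_fault() and shmem_file_splice_read() pass SGP_CACHE,
 * shmem_write_begin() and shmem_symlink() pass SGP_WRITE, and
 * do_shmem_file_read() passes SGP_READ for ordinary reads but
 * SGP_DIRTY when reading on behalf of a stacking filesystem.
 */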
89
90#ifdef CONFIG_TMPFS
91static unsigned long shmem_default_max_blocks(void)
92{
93	return totalram_pages / 2;
94}
95
96static unsigned long shmem_default_max_inodes(void)
97{
98	return min(totalram_pages - totalhigh_pages, totalram_pages / 2);
99}
100#endif
101
102static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
103	struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type);
104
105static inline int shmem_getpage(struct inode *inode, pgoff_t index,
106	struct page **pagep, enum sgp_type sgp, int *fault_type)
107{
108	return shmem_getpage_gfp(inode, index, pagep, sgp,
109			mapping_gfp_mask(inode->i_mapping), fault_type);
110}
111
112static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
113{
114	return sb->s_fs_info;
115}
116
117/*
118 * shmem_file_setup pre-accounts the whole fixed size of a VM object,
119 * for shared memory and for shared anonymous (/dev/zero) mappings
120 * (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1),
121 * consistent with the pre-accounting of private mappings ...
122 */
123static inline int shmem_acct_size(unsigned long flags, loff_t size)
124{
125	return (flags & VM_NORESERVE) ?
126		0 : security_vm_enough_memory_kern(VM_ACCT(size));
127}
128
129static inline void shmem_unacct_size(unsigned long flags, loff_t size)
130{
131	if (!(flags & VM_NORESERVE))
132		vm_unacct_memory(VM_ACCT(size));
133}
134
135/*
136 * ... whereas tmpfs objects are accounted incrementally as
137 * pages are allocated, in order to allow huge sparse files.
138 * shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM,
139 * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
140 */
141static inline int shmem_acct_block(unsigned long flags)
142{
143	return (flags & VM_NORESERVE) ?
144		security_vm_enough_memory_kern(VM_ACCT(PAGE_CACHE_SIZE)) : 0;
145}
146
147static inline void shmem_unacct_blocks(unsigned long flags, long pages)
148{
149	if (flags & VM_NORESERVE)
150		vm_unacct_memory(pages * VM_ACCT(PAGE_CACHE_SIZE));
151}
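
/*
 * Worked example (illustrative, assuming 4kB PAGE_CACHE_SIZE): a 10000
 * byte SysV shm segment is pre-accounted up front by shmem_acct_size()
 * as VM_ACCT(10000) == 3 pages, whereas a tmpfs file of the same size
 * (created with VM_NORESERVE by shmem_get_inode's callers) is accounted
 * one page at a time via shmem_acct_block() only as pages are actually
 * allocated, so an untouched hole costs nothing.
 */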
152
153static const struct super_operations shmem_ops;
154static const struct address_space_operations shmem_aops;
155static const struct file_operations shmem_file_operations;
156static const struct inode_operations shmem_inode_operations;
157static const struct inode_operations shmem_dir_inode_operations;
158static const struct inode_operations shmem_special_inode_operations;
159static const struct vm_operations_struct shmem_vm_ops;
160
161static struct backing_dev_info shmem_backing_dev_info  __read_mostly = {
162	.ra_pages	= 0,	/* No readahead */
163	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
164};
165
166static LIST_HEAD(shmem_swaplist);
167static DEFINE_MUTEX(shmem_swaplist_mutex);
168
169static void shmem_free_blocks(struct inode *inode, long pages)
170{
171	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
172	if (sbinfo->max_blocks) {
173		percpu_counter_add(&sbinfo->used_blocks, -pages);
174		inode->i_blocks -= pages*BLOCKS_PER_PAGE;
175	}
176}
177
178static int shmem_reserve_inode(struct super_block *sb)
179{
180	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
181	if (sbinfo->max_inodes) {
182		spin_lock(&sbinfo->stat_lock);
183		if (!sbinfo->free_inodes) {
184			spin_unlock(&sbinfo->stat_lock);
185			return -ENOSPC;
186		}
187		sbinfo->free_inodes--;
188		spin_unlock(&sbinfo->stat_lock);
189	}
190	return 0;
191}
192
193static void shmem_free_inode(struct super_block *sb)
194{
195	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
196	if (sbinfo->max_inodes) {
197		spin_lock(&sbinfo->stat_lock);
198		sbinfo->free_inodes++;
199		spin_unlock(&sbinfo->stat_lock);
200	}
201}
202
203/**
204 * shmem_recalc_inode - recalculate the size of an inode
205 * @inode: inode to recalc
206 *
207 * We have to calculate the free blocks since the mm can drop
208 * undirtied hole pages behind our back.
209 *
210 * But normally  info->alloced == inode->i_mapping->nrpages + info->swapped,
211 * so the mm has freed  info->alloced - (inode->i_mapping->nrpages + info->swapped) pages.
212 *
213 * It has to be called with the spinlock held.
214 */
215static void shmem_recalc_inode(struct inode *inode)
216{
217	struct shmem_inode_info *info = SHMEM_I(inode);
218	long freed;
219
220	freed = info->alloced - info->swapped - inode->i_mapping->nrpages;
221	if (freed > 0) {
222		info->alloced -= freed;
223		shmem_unacct_blocks(info->flags, freed);
224		shmem_free_blocks(inode, freed);
225	}
226}
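
/*
 * Worked example (illustrative): if info->alloced is 8 while the mapping
 * now holds nrpages == 5 and info->swapped == 2, the mm has reclaimed one
 * clean hole page behind our back; the calculation above drops alloced to
 * 7 and returns that one page to the accounting via shmem_unacct_blocks()
 * and shmem_free_blocks().
 */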
227
228static void shmem_put_swap(struct shmem_inode_info *info, pgoff_t index,
229			   swp_entry_t swap)
230{
231	if (index < SHMEM_NR_DIRECT)
232		info->i_direct[index] = swap;
233}
234
235static swp_entry_t shmem_get_swap(struct shmem_inode_info *info, pgoff_t index)
236{
237	return (index < SHMEM_NR_DIRECT) ?
238		info->i_direct[index] : (swp_entry_t){0};
239}
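
/*
 * Note on this interim scheme: only offsets below SHMEM_NR_DIRECT can
 * record a swap entry in i_direct[], so shmem_put_swap() on a larger
 * index is silently a no-op and shmem_get_swap() for it returns
 * (swp_entry_t){0}; shmem_writepage() below therefore refuses to swap
 * out pages at or beyond that offset.
 */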
240
241void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
242{
243	struct address_space *mapping = inode->i_mapping;
244	struct shmem_inode_info *info = SHMEM_I(inode);
245	pgoff_t start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
246	pgoff_t end = (lend >> PAGE_CACHE_SHIFT);
247	pgoff_t index;
248	swp_entry_t swap;
249
250	truncate_inode_pages_range(mapping, lstart, lend);
251
252	if (end > SHMEM_NR_DIRECT)
253		end = SHMEM_NR_DIRECT;
254
255	spin_lock(&info->lock);
256	for (index = start; index < end; index++) {
257		swap = shmem_get_swap(info, index);
258		if (swap.val) {
259			free_swap_and_cache(swap);
260			shmem_put_swap(info, index, (swp_entry_t){0});
261			info->swapped--;
262		}
263	}
264
265	if (mapping->nrpages) {
266		spin_unlock(&info->lock);
267		/*
268		 * A page may have meanwhile sneaked in from swap.
269		 */
270		truncate_inode_pages_range(mapping, lstart, lend);
271		spin_lock(&info->lock);
272	}
273
274	shmem_recalc_inode(inode);
275	spin_unlock(&info->lock);
276
277	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
278}
279EXPORT_SYMBOL_GPL(shmem_truncate_range);
280
281static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
282{
283	struct inode *inode = dentry->d_inode;
284	int error;
285
286	error = inode_change_ok(inode, attr);
287	if (error)
288		return error;
289
290	if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
291		loff_t oldsize = inode->i_size;
292		loff_t newsize = attr->ia_size;
293		struct page *page = NULL;
294
295		if (newsize < oldsize) {
296			/*
297			 * If truncating down to a partial page, then
298			 * if that page is already allocated, hold it
299			 * in memory until the truncation is over, so that
300			 * truncate_partial_page cannot miss it if it had
301			 * been assigned to swap.
302			 */
303			if (newsize & (PAGE_CACHE_SIZE-1)) {
304				(void) shmem_getpage(inode,
305					newsize >> PAGE_CACHE_SHIFT,
306						&page, SGP_READ, NULL);
307				if (page)
308					unlock_page(page);
309			}
310		}
311		if (newsize != oldsize) {
312			i_size_write(inode, newsize);
313			inode->i_ctime = inode->i_mtime = CURRENT_TIME;
314		}
315		if (newsize < oldsize) {
316			loff_t holebegin = round_up(newsize, PAGE_SIZE);
317			unmap_mapping_range(inode->i_mapping, holebegin, 0, 1);
318			shmem_truncate_range(inode, newsize, (loff_t)-1);
319			/* unmap again to remove racily COWed private pages */
320			unmap_mapping_range(inode->i_mapping, holebegin, 0, 1);
321		}
322		if (page)
323			page_cache_release(page);
324	}
325
326	setattr_copy(inode, attr);
327#ifdef CONFIG_TMPFS_POSIX_ACL
328	if (attr->ia_valid & ATTR_MODE)
329		error = generic_acl_chmod(inode);
330#endif
331	return error;
332}
333
334static void shmem_evict_inode(struct inode *inode)
335{
336	struct shmem_inode_info *info = SHMEM_I(inode);
337	struct shmem_xattr *xattr, *nxattr;
338
339	if (inode->i_mapping->a_ops == &shmem_aops) {
340		shmem_unacct_size(info->flags, inode->i_size);
341		inode->i_size = 0;
342		shmem_truncate_range(inode, 0, (loff_t)-1);
343		if (!list_empty(&info->swaplist)) {
344			mutex_lock(&shmem_swaplist_mutex);
345			list_del_init(&info->swaplist);
346			mutex_unlock(&shmem_swaplist_mutex);
347		}
348	}
349
350	list_for_each_entry_safe(xattr, nxattr, &info->xattr_list, list) {
351		kfree(xattr->name);
352		kfree(xattr);
353	}
354	BUG_ON(inode->i_blocks);
355	shmem_free_inode(inode->i_sb);
356	end_writeback(inode);
357}
358
359static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, struct page *page)
360{
361	struct address_space *mapping = info->vfs_inode.i_mapping;
362	unsigned long idx;
363	int error;
364
365	for (idx = 0; idx < SHMEM_NR_DIRECT; idx++)
366		if (shmem_get_swap(info, idx).val == entry.val)
367			goto found;
368	return 0;
369found:
370	spin_lock(&info->lock);
371	if (shmem_get_swap(info, idx).val != entry.val) {
372		spin_unlock(&info->lock);
373		return 0;
374	}
375
376	/*
377	 * Move _head_ to start search for next from here.
378	 * But be careful: shmem_evict_inode checks list_empty without taking
379	 * mutex, and there's an instant in list_move_tail when info->swaplist
380	 * would appear empty, if it were the only one on shmem_swaplist.
381	 */
382	if (shmem_swaplist.next != &info->swaplist)
383		list_move_tail(&shmem_swaplist, &info->swaplist);
384
385	/*
386	 * We rely on shmem_swaplist_mutex, not only to protect the swaplist,
387	 * but also to hold up shmem_evict_inode(): so inode cannot be freed
388	 * beneath us (pagelock doesn't help until the page is in pagecache).
389	 */
390	error = add_to_page_cache_locked(page, mapping, idx, GFP_NOWAIT);
391	/* which does mem_cgroup_uncharge_cache_page on error */
392
393	if (error != -ENOMEM) {
394		delete_from_swap_cache(page);
395		set_page_dirty(page);
396		shmem_put_swap(info, idx, (swp_entry_t){0});
397		info->swapped--;
398		swap_free(entry);
399		error = 1;	/* not an error, but entry was found */
400	}
401	spin_unlock(&info->lock);
402	return error;
403}
404
405/*
406 * shmem_unuse() searches for a possibly swapped-out shmem page.
407 */
408int shmem_unuse(swp_entry_t entry, struct page *page)
409{
410	struct list_head *p, *next;
411	struct shmem_inode_info *info;
412	int found = 0;
413	int error;
414
415	/*
416	 * Charge page using GFP_KERNEL while we can wait, before taking
417	 * the shmem_swaplist_mutex which might hold up shmem_writepage().
418	 * Charged back to the user (not to caller) when swap account is used.
419	 * add_to_page_cache() will be called with GFP_NOWAIT.
420	 */
421	error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL);
422	if (error)
423		goto out;
424	/*
425	 * Try to preload while we can wait, to not make a habit of
426	 * draining atomic reserves; but don't latch on to this cpu,
427	 * it's okay if sometimes we get rescheduled after this.
428	 */
429	error = radix_tree_preload(GFP_KERNEL);
430	if (error)
431		goto uncharge;
432	radix_tree_preload_end();
433
434	mutex_lock(&shmem_swaplist_mutex);
435	list_for_each_safe(p, next, &shmem_swaplist) {
436		info = list_entry(p, struct shmem_inode_info, swaplist);
437		if (!info->swapped) {
438			spin_lock(&info->lock);
439			if (!info->swapped)
440				list_del_init(&info->swaplist);
441			spin_unlock(&info->lock);
442		}
443		if (info->swapped)
444			found = shmem_unuse_inode(info, entry, page);
445		cond_resched();
446		if (found)
447			break;
448	}
449	mutex_unlock(&shmem_swaplist_mutex);
450
451uncharge:
452	if (!found)
453		mem_cgroup_uncharge_cache_page(page);
454	if (found < 0)
455		error = found;
456out:
457	unlock_page(page);
458	page_cache_release(page);
459	return error;
460}
461
462/*
463 * Move the page from the page cache to the swap cache.
464 */
465static int shmem_writepage(struct page *page, struct writeback_control *wbc)
466{
467	struct shmem_inode_info *info;
468	swp_entry_t swap, oswap;
469	struct address_space *mapping;
470	unsigned long index;
471	struct inode *inode;
472
473	BUG_ON(!PageLocked(page));
474	mapping = page->mapping;
475	index = page->index;
476	inode = mapping->host;
477	info = SHMEM_I(inode);
478	if (info->flags & VM_LOCKED)
479		goto redirty;
480	if (!total_swap_pages)
481		goto redirty;
482
483	/*
484	 * shmem_backing_dev_info's capabilities prevent regular writeback or
485	 * sync from ever calling shmem_writepage; but a stacking filesystem
486	 * might use ->writepage of its underlying filesystem, in which case
487	 * tmpfs should write out to swap only in response to memory pressure,
488	 * and not for the writeback threads or sync.
489	 */
490	if (!wbc->for_reclaim) {
491		WARN_ON_ONCE(1);	/* Still happens? Tell us about it! */
492		goto redirty;
493	}
494
495	/*
496	 * Just for this patch, we have a toy implementation,
497	 * which can swap out only the first SHMEM_NR_DIRECT pages:
498	 * as a simple demonstration of where we need to think about swap.
499	 */
500	if (index >= SHMEM_NR_DIRECT)
501		goto redirty;
502
503	swap = get_swap_page();
504	if (!swap.val)
505		goto redirty;
506
507	/*
508	 * Add inode to shmem_unuse()'s list of swapped-out inodes,
509	 * if it's not already there.  Do it now because we cannot take
510	 * mutex while holding spinlock, and must do so before the page
511	 * is moved to swap cache, when its pagelock no longer protects
512	 * the inode from eviction.  But don't unlock the mutex until
513	 * we've taken the spinlock, because shmem_unuse_inode() will
514	 * prune a !swapped inode from the swaplist under both locks.
515	 */
516	mutex_lock(&shmem_swaplist_mutex);
517	if (list_empty(&info->swaplist))
518		list_add_tail(&info->swaplist, &shmem_swaplist);
519
520	spin_lock(&info->lock);
521	mutex_unlock(&shmem_swaplist_mutex);
522
523	oswap = shmem_get_swap(info, index);
524	if (oswap.val) {
525		WARN_ON_ONCE(1);	/* Still happens? Tell us about it! */
526		free_swap_and_cache(oswap);
527		shmem_put_swap(info, index, (swp_entry_t){0});
528		info->swapped--;
529	}
530	shmem_recalc_inode(inode);
531
532	if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
533		delete_from_page_cache(page);
534		shmem_put_swap(info, index, swap);
535		info->swapped++;
536		swap_shmem_alloc(swap);
537		spin_unlock(&info->lock);
538		BUG_ON(page_mapped(page));
539		swap_writepage(page, wbc);
540		return 0;
541	}
542
543	spin_unlock(&info->lock);
544	swapcache_free(swap, NULL);
545redirty:
546	set_page_dirty(page);
547	if (wbc->for_reclaim)
548		return AOP_WRITEPAGE_ACTIVATE;	/* Return with page locked */
549	unlock_page(page);
550	return 0;
551}
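
/*
 * Locking sketch for the swap-out path above (descriptive only): take
 * shmem_swaplist_mutex, add the inode to shmem_swaplist if needed, take
 * info->lock, then drop the mutex; the swap slot is recorded in i_direct[]
 * under info->lock before the spinlock is released and swap_writepage()
 * is called, so shmem_unuse_inode() can always find the entry again.
 */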
552
553#ifdef CONFIG_NUMA
554#ifdef CONFIG_TMPFS
555static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
556{
557	char buffer[64];
558
559	if (!mpol || mpol->mode == MPOL_DEFAULT)
560		return;		/* show nothing */
561
562	mpol_to_str(buffer, sizeof(buffer), mpol, 1);
563
564	seq_printf(seq, ",mpol=%s", buffer);
565}
566
567static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
568{
569	struct mempolicy *mpol = NULL;
570	if (sbinfo->mpol) {
571		spin_lock(&sbinfo->stat_lock);	/* prevent replace/use races */
572		mpol = sbinfo->mpol;
573		mpol_get(mpol);
574		spin_unlock(&sbinfo->stat_lock);
575	}
576	return mpol;
577}
578#endif /* CONFIG_TMPFS */
579
580static struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp,
581			struct shmem_inode_info *info, unsigned long idx)
582{
583	struct mempolicy mpol, *spol;
584	struct vm_area_struct pvma;
585	struct page *page;
586
587	spol = mpol_cond_copy(&mpol,
588				mpol_shared_policy_lookup(&info->policy, idx));
589
590	/* Create a pseudo vma that just contains the policy */
591	pvma.vm_start = 0;
592	pvma.vm_pgoff = idx;
593	pvma.vm_ops = NULL;
594	pvma.vm_policy = spol;
595	page = swapin_readahead(entry, gfp, &pvma, 0);
596	return page;
597}
598
599static struct page *shmem_alloc_page(gfp_t gfp,
600			struct shmem_inode_info *info, unsigned long idx)
601{
602	struct vm_area_struct pvma;
603
604	/* Create a pseudo vma that just contains the policy */
605	pvma.vm_start = 0;
606	pvma.vm_pgoff = idx;
607	pvma.vm_ops = NULL;
608	pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx);
609
610	/*
611	 * alloc_page_vma() will drop the shared policy reference
612	 */
613	return alloc_page_vma(gfp, &pvma, 0);
614}
615#else /* !CONFIG_NUMA */
616#ifdef CONFIG_TMPFS
617static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *p)
618{
619}
620#endif /* CONFIG_TMPFS */
621
622static inline struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp,
623			struct shmem_inode_info *info, unsigned long idx)
624{
625	return swapin_readahead(entry, gfp, NULL, 0);
626}
627
628static inline struct page *shmem_alloc_page(gfp_t gfp,
629			struct shmem_inode_info *info, unsigned long idx)
630{
631	return alloc_page(gfp);
632}
633#endif /* CONFIG_NUMA */
634
635#if !defined(CONFIG_NUMA) || !defined(CONFIG_TMPFS)
636static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
637{
638	return NULL;
639}
640#endif
641
642/*
643 * shmem_getpage_gfp - find page in cache, or get from swap, or allocate
644 *
645 * If we allocate a new one we do not mark it dirty. That's up to the
646 * vm. If we swap it in we mark it dirty and free the swap entry,
647 * since a page cannot live in both the swap cache and the page cache.
648 */
649static int shmem_getpage_gfp(struct inode *inode, pgoff_t idx,
650	struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type)
651{
652	struct address_space *mapping = inode->i_mapping;
653	struct shmem_inode_info *info = SHMEM_I(inode);
654	struct shmem_sb_info *sbinfo;
655	struct page *page;
656	struct page *prealloc_page = NULL;
657	swp_entry_t swap;
658	int error;
659
660	if (idx > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT))
661		return -EFBIG;
662repeat:
663	page = find_lock_page(mapping, idx);
664	if (page) {
665		/*
666		 * Once we can get the page lock, it must be uptodate:
667		 * if there were an error in reading back from swap,
668		 * the page would not be inserted into the filecache.
669		 */
670		BUG_ON(!PageUptodate(page));
671		goto done;
672	}
673
674	/*
675	 * Try to preload while we can wait, to not make a habit of
676	 * draining atomic reserves; but don't latch on to this cpu.
677	 */
678	error = radix_tree_preload(gfp & GFP_RECLAIM_MASK);
679	if (error)
680		goto out;
681	radix_tree_preload_end();
682
683	if (sgp != SGP_READ && !prealloc_page) {
684		prealloc_page = shmem_alloc_page(gfp, info, idx);
685		if (prealloc_page) {
686			SetPageSwapBacked(prealloc_page);
687			if (mem_cgroup_cache_charge(prealloc_page,
688					current->mm, GFP_KERNEL)) {
689				page_cache_release(prealloc_page);
690				prealloc_page = NULL;
691			}
692		}
693	}
694
695	spin_lock(&info->lock);
696	shmem_recalc_inode(inode);
697	swap = shmem_get_swap(info, idx);
698	if (swap.val) {
699		/* Look it up and read it in. */
700		page = lookup_swap_cache(swap);
701		if (!page) {
702			spin_unlock(&info->lock);
703			/* here we actually do the io */
704			if (fault_type)
705				*fault_type |= VM_FAULT_MAJOR;
706			page = shmem_swapin(swap, gfp, info, idx);
707			if (!page) {
708				swp_entry_t nswap = shmem_get_swap(info, idx);
709				if (nswap.val == swap.val) {
710					error = -ENOMEM;
711					goto out;
712				}
713				goto repeat;
714			}
715			wait_on_page_locked(page);
716			page_cache_release(page);
717			goto repeat;
718		}
719
720		/* We have to do this with page locked to prevent races */
721		if (!trylock_page(page)) {
722			spin_unlock(&info->lock);
723			wait_on_page_locked(page);
724			page_cache_release(page);
725			goto repeat;
726		}
727		if (PageWriteback(page)) {
728			spin_unlock(&info->lock);
729			wait_on_page_writeback(page);
730			unlock_page(page);
731			page_cache_release(page);
732			goto repeat;
733		}
734		if (!PageUptodate(page)) {
735			spin_unlock(&info->lock);
736			unlock_page(page);
737			page_cache_release(page);
738			error = -EIO;
739			goto out;
740		}
741
742		error = add_to_page_cache_locked(page, mapping,
743						 idx, GFP_NOWAIT);
744		if (error) {
745			spin_unlock(&info->lock);
746			if (error == -ENOMEM) {
747				/*
748				 * Reclaim from the proper memory cgroup and
749				 * invoke the memcg OOM handler if needed.
750				 */
751				error = mem_cgroup_shmem_charge_fallback(
752						page, current->mm, gfp);
753				if (error) {
754					unlock_page(page);
755					page_cache_release(page);
756					goto out;
757				}
758			}
759			unlock_page(page);
760			page_cache_release(page);
761			goto repeat;
762		}
763
764		delete_from_swap_cache(page);
765		shmem_put_swap(info, idx, (swp_entry_t){0});
766		info->swapped--;
767		spin_unlock(&info->lock);
768		set_page_dirty(page);
769		swap_free(swap);
770
771	} else if (sgp == SGP_READ) {
772		page = find_get_page(mapping, idx);
773		if (page && !trylock_page(page)) {
774			spin_unlock(&info->lock);
775			wait_on_page_locked(page);
776			page_cache_release(page);
777			goto repeat;
778		}
779		spin_unlock(&info->lock);
780
781	} else if (prealloc_page) {
782		sbinfo = SHMEM_SB(inode->i_sb);
783		if (sbinfo->max_blocks) {
784			if (percpu_counter_compare(&sbinfo->used_blocks,
785						sbinfo->max_blocks) >= 0 ||
786			    shmem_acct_block(info->flags))
787				goto nospace;
788			percpu_counter_inc(&sbinfo->used_blocks);
789			inode->i_blocks += BLOCKS_PER_PAGE;
790		} else if (shmem_acct_block(info->flags))
791			goto nospace;
792
793		page = prealloc_page;
794		prealloc_page = NULL;
795
796		swap = shmem_get_swap(info, idx);
797		if (swap.val)
798			mem_cgroup_uncharge_cache_page(page);
799		else
800			error = add_to_page_cache_lru(page, mapping,
801						idx, GFP_NOWAIT);
802		/*
803		 * On add_to_page_cache_lru() failure, the uncharge
804		 * is done automatically.
805		 */
806		if (swap.val || error) {
807			shmem_unacct_blocks(info->flags, 1);
808			shmem_free_blocks(inode, 1);
809			spin_unlock(&info->lock);
810			page_cache_release(page);
811			goto repeat;
812		}
813
814		info->alloced++;
815		spin_unlock(&info->lock);
816		clear_highpage(page);
817		flush_dcache_page(page);
818		SetPageUptodate(page);
819		if (sgp == SGP_DIRTY)
820			set_page_dirty(page);
821
822	} else {
823		spin_unlock(&info->lock);
824		error = -ENOMEM;
825		goto out;
826	}
827done:
828	*pagep = page;
829	error = 0;
830out:
831	if (prealloc_page) {
832		mem_cgroup_uncharge_cache_page(prealloc_page);
833		page_cache_release(prealloc_page);
834	}
835	return error;
836
837nospace:
838	/*
839	 * Perhaps the page was brought in from swap between find_lock_page
840	 * and taking info->lock?  We allow for that at add_to_page_cache_lru,
841	 * but must also avoid reporting a spurious ENOSPC while working on a
842	 * full tmpfs.
843	 */
844	page = find_get_page(mapping, idx);
845	spin_unlock(&info->lock);
846	if (page) {
847		page_cache_release(page);
848		goto repeat;
849	}
850	error = -ENOSPC;
851	goto out;
852}
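
/*
 * Usage sketch (illustrative only, mirroring do_shmem_file_read() and
 * shmem_fault()): the page comes back locked and the caller must unlock
 * and release it; with SGP_READ a hole is returned as *pagep == NULL
 * rather than as a freshly zeroed page.
 *
 *	struct page *page = NULL;
 *	int error = shmem_getpage(inode, index, &page, SGP_READ, NULL);
 *	if (!error && page) {
 *		unlock_page(page);
 *		...
 *		page_cache_release(page);
 *	}
 */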
853
854static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
855{
856	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
857	int error;
858	int ret = VM_FAULT_LOCKED;
859
860	if (((loff_t)vmf->pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
861		return VM_FAULT_SIGBUS;
862
863	error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret);
864	if (error)
865		return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
866
867	if (ret & VM_FAULT_MAJOR) {
868		count_vm_event(PGMAJFAULT);
869		mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
870	}
871	return ret;
872}
873
874#ifdef CONFIG_NUMA
875static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
876{
877	struct inode *i = vma->vm_file->f_path.dentry->d_inode;
878	return mpol_set_shared_policy(&SHMEM_I(i)->policy, vma, new);
879}
880
881static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
882					  unsigned long addr)
883{
884	struct inode *i = vma->vm_file->f_path.dentry->d_inode;
885	unsigned long idx;
886
887	idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
888	return mpol_shared_policy_lookup(&SHMEM_I(i)->policy, idx);
889}
890#endif
891
892int shmem_lock(struct file *file, int lock, struct user_struct *user)
893{
894	struct inode *inode = file->f_path.dentry->d_inode;
895	struct shmem_inode_info *info = SHMEM_I(inode);
896	int retval = -ENOMEM;
897
898	spin_lock(&info->lock);
899	if (lock && !(info->flags & VM_LOCKED)) {
900		if (!user_shm_lock(inode->i_size, user))
901			goto out_nomem;
902		info->flags |= VM_LOCKED;
903		mapping_set_unevictable(file->f_mapping);
904	}
905	if (!lock && (info->flags & VM_LOCKED) && user) {
906		user_shm_unlock(inode->i_size, user);
907		info->flags &= ~VM_LOCKED;
908		mapping_clear_unevictable(file->f_mapping);
909		scan_mapping_unevictable_pages(file->f_mapping);
910	}
911	retval = 0;
912
913out_nomem:
914	spin_unlock(&info->lock);
915	return retval;
916}
917
918static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
919{
920	file_accessed(file);
921	vma->vm_ops = &shmem_vm_ops;
922	vma->vm_flags |= VM_CAN_NONLINEAR;
923	return 0;
924}
925
926static struct inode *shmem_get_inode(struct super_block *sb, const struct inode *dir,
927				     int mode, dev_t dev, unsigned long flags)
928{
929	struct inode *inode;
930	struct shmem_inode_info *info;
931	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
932
933	if (shmem_reserve_inode(sb))
934		return NULL;
935
936	inode = new_inode(sb);
937	if (inode) {
938		inode->i_ino = get_next_ino();
939		inode_init_owner(inode, dir, mode);
940		inode->i_blocks = 0;
941		inode->i_mapping->backing_dev_info = &shmem_backing_dev_info;
942		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
943		inode->i_generation = get_seconds();
944		info = SHMEM_I(inode);
945		memset(info, 0, (char *)inode - (char *)info);
946		spin_lock_init(&info->lock);
947		info->flags = flags & VM_NORESERVE;
948		INIT_LIST_HEAD(&info->swaplist);
949		INIT_LIST_HEAD(&info->xattr_list);
950		cache_no_acl(inode);
951
952		switch (mode & S_IFMT) {
953		default:
954			inode->i_op = &shmem_special_inode_operations;
955			init_special_inode(inode, mode, dev);
956			break;
957		case S_IFREG:
958			inode->i_mapping->a_ops = &shmem_aops;
959			inode->i_op = &shmem_inode_operations;
960			inode->i_fop = &shmem_file_operations;
961			mpol_shared_policy_init(&info->policy,
962						 shmem_get_sbmpol(sbinfo));
963			break;
964		case S_IFDIR:
965			inc_nlink(inode);
966			/* Some things misbehave if size == 0 on a directory */
967			inode->i_size = 2 * BOGO_DIRENT_SIZE;
968			inode->i_op = &shmem_dir_inode_operations;
969			inode->i_fop = &simple_dir_operations;
970			break;
971		case S_IFLNK:
972			/*
973			 * Must not load anything in the rbtree,
974			 * mpol_free_shared_policy will not be called.
975			 */
976			mpol_shared_policy_init(&info->policy, NULL);
977			break;
978		}
979	} else
980		shmem_free_inode(sb);
981	return inode;
982}
983
984#ifdef CONFIG_TMPFS
985static const struct inode_operations shmem_symlink_inode_operations;
986static const struct inode_operations shmem_symlink_inline_operations;
987
988static int
989shmem_write_begin(struct file *file, struct address_space *mapping,
990			loff_t pos, unsigned len, unsigned flags,
991			struct page **pagep, void **fsdata)
992{
993	struct inode *inode = mapping->host;
994	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
995	return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL);
996}
997
998static int
999shmem_write_end(struct file *file, struct address_space *mapping,
1000			loff_t pos, unsigned len, unsigned copied,
1001			struct page *page, void *fsdata)
1002{
1003	struct inode *inode = mapping->host;
1004
1005	if (pos + copied > inode->i_size)
1006		i_size_write(inode, pos + copied);
1007
1008	set_page_dirty(page);
1009	unlock_page(page);
1010	page_cache_release(page);
1011
1012	return copied;
1013}
1014
1015static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc, read_actor_t actor)
1016{
1017	struct inode *inode = filp->f_path.dentry->d_inode;
1018	struct address_space *mapping = inode->i_mapping;
1019	unsigned long index, offset;
1020	enum sgp_type sgp = SGP_READ;
1021
1022	/*
1023	 * Might this read be for a stacking filesystem?  Then when reading
1024	 * holes of a sparse file, we actually need to allocate those pages,
1025	 * and even mark them dirty, so it cannot exceed the max_blocks limit.
1026	 */
1027	if (segment_eq(get_fs(), KERNEL_DS))
1028		sgp = SGP_DIRTY;
1029
1030	index = *ppos >> PAGE_CACHE_SHIFT;
1031	offset = *ppos & ~PAGE_CACHE_MASK;
1032
1033	for (;;) {
1034		struct page *page = NULL;
1035		unsigned long end_index, nr, ret;
1036		loff_t i_size = i_size_read(inode);
1037
1038		end_index = i_size >> PAGE_CACHE_SHIFT;
1039		if (index > end_index)
1040			break;
1041		if (index == end_index) {
1042			nr = i_size & ~PAGE_CACHE_MASK;
1043			if (nr <= offset)
1044				break;
1045		}
1046
1047		desc->error = shmem_getpage(inode, index, &page, sgp, NULL);
1048		if (desc->error) {
1049			if (desc->error == -EINVAL)
1050				desc->error = 0;
1051			break;
1052		}
1053		if (page)
1054			unlock_page(page);
1055
1056		/*
1057		 * We must evaluate after, since reads (unlike writes)
1058		 * are called without i_mutex protection against truncate
1059		 */
1060		nr = PAGE_CACHE_SIZE;
1061		i_size = i_size_read(inode);
1062		end_index = i_size >> PAGE_CACHE_SHIFT;
1063		if (index == end_index) {
1064			nr = i_size & ~PAGE_CACHE_MASK;
1065			if (nr <= offset) {
1066				if (page)
1067					page_cache_release(page);
1068				break;
1069			}
1070		}
1071		nr -= offset;
1072
1073		if (page) {
1074			/*
1075			 * If users can be writing to this page using arbitrary
1076			 * virtual addresses, take care about potential aliasing
1077			 * before reading the page on the kernel side.
1078			 */
1079			if (mapping_writably_mapped(mapping))
1080				flush_dcache_page(page);
1081			/*
1082			 * Mark the page accessed if we read the beginning.
1083			 */
1084			if (!offset)
1085				mark_page_accessed(page);
1086		} else {
1087			page = ZERO_PAGE(0);
1088			page_cache_get(page);
1089		}
1090
1091		/*
1092		 * Ok, we have the page, and it's up-to-date, so
1093		 * now we can copy it to user space...
1094		 *
1095		 * The actor routine returns how many bytes were actually used..
1096		 * NOTE! This may not be the same as how much of a user buffer
1097		 * we filled up (we may be padding etc), so we can only update
1098		 * "pos" here (the actor routine has to update the user buffer
1099		 * pointers and the remaining count).
1100		 */
1101		ret = actor(desc, page, offset, nr);
1102		offset += ret;
1103		index += offset >> PAGE_CACHE_SHIFT;
1104		offset &= ~PAGE_CACHE_MASK;
1105
1106		page_cache_release(page);
1107		if (ret != nr || !desc->count)
1108			break;
1109
1110		cond_resched();
1111	}
1112
1113	*ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
1114	file_accessed(filp);
1115}
1116
1117static ssize_t shmem_file_aio_read(struct kiocb *iocb,
1118		const struct iovec *iov, unsigned long nr_segs, loff_t pos)
1119{
1120	struct file *filp = iocb->ki_filp;
1121	ssize_t retval;
1122	unsigned long seg;
1123	size_t count;
1124	loff_t *ppos = &iocb->ki_pos;
1125
1126	retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
1127	if (retval)
1128		return retval;
1129
1130	for (seg = 0; seg < nr_segs; seg++) {
1131		read_descriptor_t desc;
1132
1133		desc.written = 0;
1134		desc.arg.buf = iov[seg].iov_base;
1135		desc.count = iov[seg].iov_len;
1136		if (desc.count == 0)
1137			continue;
1138		desc.error = 0;
1139		do_shmem_file_read(filp, ppos, &desc, file_read_actor);
1140		retval += desc.written;
1141		if (desc.error) {
1142			retval = retval ?: desc.error;
1143			break;
1144		}
1145		if (desc.count > 0)
1146			break;
1147	}
1148	return retval;
1149}
1150
1151static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
1152				struct pipe_inode_info *pipe, size_t len,
1153				unsigned int flags)
1154{
1155	struct address_space *mapping = in->f_mapping;
1156	struct inode *inode = mapping->host;
1157	unsigned int loff, nr_pages, req_pages;
1158	struct page *pages[PIPE_DEF_BUFFERS];
1159	struct partial_page partial[PIPE_DEF_BUFFERS];
1160	struct page *page;
1161	pgoff_t index, end_index;
1162	loff_t isize, left;
1163	int error, page_nr;
1164	struct splice_pipe_desc spd = {
1165		.pages = pages,
1166		.partial = partial,
1167		.flags = flags,
1168		.ops = &page_cache_pipe_buf_ops,
1169		.spd_release = spd_release_page,
1170	};
1171
1172	isize = i_size_read(inode);
1173	if (unlikely(*ppos >= isize))
1174		return 0;
1175
1176	left = isize - *ppos;
1177	if (unlikely(left < len))
1178		len = left;
1179
1180	if (splice_grow_spd(pipe, &spd))
1181		return -ENOMEM;
1182
1183	index = *ppos >> PAGE_CACHE_SHIFT;
1184	loff = *ppos & ~PAGE_CACHE_MASK;
1185	req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1186	nr_pages = min(req_pages, pipe->buffers);
1187
1188	spd.nr_pages = find_get_pages_contig(mapping, index,
1189						nr_pages, spd.pages);
1190	index += spd.nr_pages;
1191	error = 0;
1192
1193	while (spd.nr_pages < nr_pages) {
1194		error = shmem_getpage(inode, index, &page, SGP_CACHE, NULL);
1195		if (error)
1196			break;
1197		unlock_page(page);
1198		spd.pages[spd.nr_pages++] = page;
1199		index++;
1200	}
1201
1202	index = *ppos >> PAGE_CACHE_SHIFT;
1203	nr_pages = spd.nr_pages;
1204	spd.nr_pages = 0;
1205
1206	for (page_nr = 0; page_nr < nr_pages; page_nr++) {
1207		unsigned int this_len;
1208
1209		if (!len)
1210			break;
1211
1212		this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
1213		page = spd.pages[page_nr];
1214
1215		if (!PageUptodate(page) || page->mapping != mapping) {
1216			error = shmem_getpage(inode, index, &page,
1217							SGP_CACHE, NULL);
1218			if (error)
1219				break;
1220			unlock_page(page);
1221			page_cache_release(spd.pages[page_nr]);
1222			spd.pages[page_nr] = page;
1223		}
1224
1225		isize = i_size_read(inode);
1226		end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
1227		if (unlikely(!isize || index > end_index))
1228			break;
1229
1230		if (end_index == index) {
1231			unsigned int plen;
1232
1233			plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
1234			if (plen <= loff)
1235				break;
1236
1237			this_len = min(this_len, plen - loff);
1238			len = this_len;
1239		}
1240
1241		spd.partial[page_nr].offset = loff;
1242		spd.partial[page_nr].len = this_len;
1243		len -= this_len;
1244		loff = 0;
1245		spd.nr_pages++;
1246		index++;
1247	}
1248
1249	while (page_nr < nr_pages)
1250		page_cache_release(spd.pages[page_nr++]);
1251
1252	if (spd.nr_pages)
1253		error = splice_to_pipe(pipe, &spd);
1254
1255	splice_shrink_spd(pipe, &spd);
1256
1257	if (error > 0) {
1258		*ppos += error;
1259		file_accessed(in);
1260	}
1261	return error;
1262}
1263
1264static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
1265{
1266	struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);
1267
1268	buf->f_type = TMPFS_MAGIC;
1269	buf->f_bsize = PAGE_CACHE_SIZE;
1270	buf->f_namelen = NAME_MAX;
1271	if (sbinfo->max_blocks) {
1272		buf->f_blocks = sbinfo->max_blocks;
1273		buf->f_bavail = buf->f_bfree =
1274				sbinfo->max_blocks - percpu_counter_sum(&sbinfo->used_blocks);
1275	}
1276	if (sbinfo->max_inodes) {
1277		buf->f_files = sbinfo->max_inodes;
1278		buf->f_ffree = sbinfo->free_inodes;
1279	}
1280	/* else leave those fields 0 like simple_statfs */
1281	return 0;
1282}
1283
1284/*
1285 * File creation. Allocate an inode, and we're done.
1286 */
1287static int
1288shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
1289{
1290	struct inode *inode;
1291	int error = -ENOSPC;
1292
1293	inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE);
1294	if (inode) {
1295		error = security_inode_init_security(inode, dir,
1296						     &dentry->d_name, NULL,
1297						     NULL, NULL);
1298		if (error) {
1299			if (error != -EOPNOTSUPP) {
1300				iput(inode);
1301				return error;
1302			}
1303		}
1304#ifdef CONFIG_TMPFS_POSIX_ACL
1305		error = generic_acl_init(inode, dir);
1306		if (error) {
1307			iput(inode);
1308			return error;
1309		}
1310#else
1311		error = 0;
1312#endif
1313		dir->i_size += BOGO_DIRENT_SIZE;
1314		dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1315		d_instantiate(dentry, inode);
1316		dget(dentry); /* Extra count - pin the dentry in core */
1317	}
1318	return error;
1319}
1320
1321static int shmem_mkdir(struct inode *dir, struct dentry *dentry, int mode)
1322{
1323	int error;
1324
1325	if ((error = shmem_mknod(dir, dentry, mode | S_IFDIR, 0)))
1326		return error;
1327	inc_nlink(dir);
1328	return 0;
1329}
1330
1331static int shmem_create(struct inode *dir, struct dentry *dentry, int mode,
1332		struct nameidata *nd)
1333{
1334	return shmem_mknod(dir, dentry, mode | S_IFREG, 0);
1335}
1336
1337/*
1338 * Link a file.
1339 */
1340static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
1341{
1342	struct inode *inode = old_dentry->d_inode;
1343	int ret;
1344
1345	/*
1346	 * No ordinary (disk based) filesystem counts links as inodes;
1347	 * but each new link needs a new dentry, pinning lowmem, and
1348	 * tmpfs dentries cannot be pruned until they are unlinked.
1349	 */
1350	ret = shmem_reserve_inode(inode->i_sb);
1351	if (ret)
1352		goto out;
1353
1354	dir->i_size += BOGO_DIRENT_SIZE;
1355	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1356	inc_nlink(inode);
1357	ihold(inode);	/* New dentry reference */
1358	dget(dentry);		/* Extra pinning count for the created dentry */
1359	d_instantiate(dentry, inode);
1360out:
1361	return ret;
1362}
1363
1364static int shmem_unlink(struct inode *dir, struct dentry *dentry)
1365{
1366	struct inode *inode = dentry->d_inode;
1367
1368	if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode))
1369		shmem_free_inode(inode->i_sb);
1370
1371	dir->i_size -= BOGO_DIRENT_SIZE;
1372	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1373	drop_nlink(inode);
1374	dput(dentry);	/* Undo the count from "create" - this does all the work */
1375	return 0;
1376}
1377
1378static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
1379{
1380	if (!simple_empty(dentry))
1381		return -ENOTEMPTY;
1382
1383	drop_nlink(dentry->d_inode);
1384	drop_nlink(dir);
1385	return shmem_unlink(dir, dentry);
1386}
1387
1388/*
1389 * The VFS layer already does all the dentry stuff for rename;
1390 * we just have to decrement the usage count for the target, if
1391 * it exists, so that the VFS layer correctly frees it when it
1392 * gets overwritten.
1393 */
1394static int shmem_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry)
1395{
1396	struct inode *inode = old_dentry->d_inode;
1397	int they_are_dirs = S_ISDIR(inode->i_mode);
1398
1399	if (!simple_empty(new_dentry))
1400		return -ENOTEMPTY;
1401
1402	if (new_dentry->d_inode) {
1403		(void) shmem_unlink(new_dir, new_dentry);
1404		if (they_are_dirs)
1405			drop_nlink(old_dir);
1406	} else if (they_are_dirs) {
1407		drop_nlink(old_dir);
1408		inc_nlink(new_dir);
1409	}
1410
1411	old_dir->i_size -= BOGO_DIRENT_SIZE;
1412	new_dir->i_size += BOGO_DIRENT_SIZE;
1413	old_dir->i_ctime = old_dir->i_mtime =
1414	new_dir->i_ctime = new_dir->i_mtime =
1415	inode->i_ctime = CURRENT_TIME;
1416	return 0;
1417}
1418
1419static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
1420{
1421	int error;
1422	int len;
1423	struct inode *inode;
1424	struct page *page;
1425	char *kaddr;
1426	struct shmem_inode_info *info;
1427
1428	len = strlen(symname) + 1;
1429	if (len > PAGE_CACHE_SIZE)
1430		return -ENAMETOOLONG;
1431
1432	inode = shmem_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0, VM_NORESERVE);
1433	if (!inode)
1434		return -ENOSPC;
1435
1436	error = security_inode_init_security(inode, dir, &dentry->d_name, NULL,
1437					     NULL, NULL);
1438	if (error) {
1439		if (error != -EOPNOTSUPP) {
1440			iput(inode);
1441			return error;
1442		}
1443		error = 0;
1444	}
1445
1446	info = SHMEM_I(inode);
1447	inode->i_size = len-1;
1448	if (len <= SHMEM_SYMLINK_INLINE_LEN) {
1449		/* do it inline */
1450		memcpy(info->inline_symlink, symname, len);
1451		inode->i_op = &shmem_symlink_inline_operations;
1452	} else {
1453		error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL);
1454		if (error) {
1455			iput(inode);
1456			return error;
1457		}
1458		inode->i_mapping->a_ops = &shmem_aops;
1459		inode->i_op = &shmem_symlink_inode_operations;
1460		kaddr = kmap_atomic(page, KM_USER0);
1461		memcpy(kaddr, symname, len);
1462		kunmap_atomic(kaddr, KM_USER0);
1463		set_page_dirty(page);
1464		unlock_page(page);
1465		page_cache_release(page);
1466	}
1467	dir->i_size += BOGO_DIRENT_SIZE;
1468	dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1469	d_instantiate(dentry, inode);
1470	dget(dentry);
1471	return 0;
1472}
1473
1474static void *shmem_follow_link_inline(struct dentry *dentry, struct nameidata *nd)
1475{
1476	nd_set_link(nd, SHMEM_I(dentry->d_inode)->inline_symlink);
1477	return NULL;
1478}
1479
1480static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd)
1481{
1482	struct page *page = NULL;
1483	int res = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL);
1484	nd_set_link(nd, res ? ERR_PTR(res) : kmap(page));
1485	if (page)
1486		unlock_page(page);
1487	return page;
1488}
1489
1490static void shmem_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
1491{
1492	if (!IS_ERR(nd_get_link(nd))) {
1493		struct page *page = cookie;
1494		kunmap(page);
1495		mark_page_accessed(page);
1496		page_cache_release(page);
1497	}
1498}
1499
1500#ifdef CONFIG_TMPFS_XATTR
1501/*
1502 * Superblocks without xattr inode operations may get some security.* xattr
1503 * support from the LSM "for free". As soon as we have any other xattrs
1504 * like ACLs, we also need to implement the security.* handlers at
1505 * filesystem level, though.
1506 */
1507
1508static int shmem_xattr_get(struct dentry *dentry, const char *name,
1509			   void *buffer, size_t size)
1510{
1511	struct shmem_inode_info *info;
1512	struct shmem_xattr *xattr;
1513	int ret = -ENODATA;
1514
1515	info = SHMEM_I(dentry->d_inode);
1516
1517	spin_lock(&info->lock);
1518	list_for_each_entry(xattr, &info->xattr_list, list) {
1519		if (strcmp(name, xattr->name))
1520			continue;
1521
1522		ret = xattr->size;
1523		if (buffer) {
1524			if (size < xattr->size)
1525				ret = -ERANGE;
1526			else
1527				memcpy(buffer, xattr->value, xattr->size);
1528		}
1529		break;
1530	}
1531	spin_unlock(&info->lock);
1532	return ret;
1533}
1534
1535static int shmem_xattr_set(struct dentry *dentry, const char *name,
1536			   const void *value, size_t size, int flags)
1537{
1538	struct inode *inode = dentry->d_inode;
1539	struct shmem_inode_info *info = SHMEM_I(inode);
1540	struct shmem_xattr *xattr;
1541	struct shmem_xattr *new_xattr = NULL;
1542	size_t len;
1543	int err = 0;
1544
1545	/* value == NULL means remove */
1546	if (value) {
1547		/* wrap around? */
1548		len = sizeof(*new_xattr) + size;
1549		if (len <= sizeof(*new_xattr))
1550			return -ENOMEM;
1551
1552		new_xattr = kmalloc(len, GFP_KERNEL);
1553		if (!new_xattr)
1554			return -ENOMEM;
1555
1556		new_xattr->name = kstrdup(name, GFP_KERNEL);
1557		if (!new_xattr->name) {
1558			kfree(new_xattr);
1559			return -ENOMEM;
1560		}
1561
1562		new_xattr->size = size;
1563		memcpy(new_xattr->value, value, size);
1564	}
1565
1566	spin_lock(&info->lock);
1567	list_for_each_entry(xattr, &info->xattr_list, list) {
1568		if (!strcmp(name, xattr->name)) {
1569			if (flags & XATTR_CREATE) {
1570				xattr = new_xattr;
1571				err = -EEXIST;
1572			} else if (new_xattr) {
1573				list_replace(&xattr->list, &new_xattr->list);
1574			} else {
1575				list_del(&xattr->list);
1576			}
1577			goto out;
1578		}
1579	}
1580	if (flags & XATTR_REPLACE) {
1581		xattr = new_xattr;
1582		err = -ENODATA;
1583	} else {
1584		list_add(&new_xattr->list, &info->xattr_list);
1585		xattr = NULL;
1586	}
1587out:
1588	spin_unlock(&info->lock);
1589	if (xattr)
1590		kfree(xattr->name);
1591	kfree(xattr);
1592	return err;
1593}
1594
1595
1596static const struct xattr_handler *shmem_xattr_handlers[] = {
1597#ifdef CONFIG_TMPFS_POSIX_ACL
1598	&generic_acl_access_handler,
1599	&generic_acl_default_handler,
1600#endif
1601	NULL
1602};
1603
1604static int shmem_xattr_validate(const char *name)
1605{
1606	struct { const char *prefix; size_t len; } arr[] = {
1607		{ XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN },
1608		{ XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN }
1609	};
1610	int i;
1611
1612	for (i = 0; i < ARRAY_SIZE(arr); i++) {
1613		size_t preflen = arr[i].len;
1614		if (strncmp(name, arr[i].prefix, preflen) == 0) {
1615			if (!name[preflen])
1616				return -EINVAL;
1617			return 0;
1618		}
1619	}
1620	return -EOPNOTSUPP;
1621}
1622
1623static ssize_t shmem_getxattr(struct dentry *dentry, const char *name,
1624			      void *buffer, size_t size)
1625{
1626	int err;
1627
1628	/*
1629	 * If this is a request for a synthetic attribute in the system.*
1630	 * namespace use the generic infrastructure to resolve a handler
1631	 * for it via sb->s_xattr.
1632	 */
1633	if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
1634		return generic_getxattr(dentry, name, buffer, size);
1635
1636	err = shmem_xattr_validate(name);
1637	if (err)
1638		return err;
1639
1640	return shmem_xattr_get(dentry, name, buffer, size);
1641}
1642
1643static int shmem_setxattr(struct dentry *dentry, const char *name,
1644			  const void *value, size_t size, int flags)
1645{
1646	int err;
1647
1648	/*
1649	 * If this is a request for a synthetic attribute in the system.*
1650	 * namespace use the generic infrastructure to resolve a handler
1651	 * for it via sb->s_xattr.
1652	 */
1653	if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
1654		return generic_setxattr(dentry, name, value, size, flags);
1655
1656	err = shmem_xattr_validate(name);
1657	if (err)
1658		return err;
1659
1660	if (size == 0)
1661		value = "";  /* empty EA, do not remove */
1662
1663	return shmem_xattr_set(dentry, name, value, size, flags);
1664
1665}
1666
1667static int shmem_removexattr(struct dentry *dentry, const char *name)
1668{
1669	int err;
1670
1671	/*
1672	 * If this is a request for a synthetic attribute in the system.*
1673	 * namespace use the generic infrastructure to resolve a handler
1674	 * for it via sb->s_xattr.
1675	 */
1676	if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
1677		return generic_removexattr(dentry, name);
1678
1679	err = shmem_xattr_validate(name);
1680	if (err)
1681		return err;
1682
1683	return shmem_xattr_set(dentry, name, NULL, 0, XATTR_REPLACE);
1684}
1685
1686static bool xattr_is_trusted(const char *name)
1687{
1688	return !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN);
1689}
1690
1691static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size)
1692{
1693	bool trusted = capable(CAP_SYS_ADMIN);
1694	struct shmem_xattr *xattr;
1695	struct shmem_inode_info *info;
1696	size_t used = 0;
1697
1698	info = SHMEM_I(dentry->d_inode);
1699
1700	spin_lock(&info->lock);
1701	list_for_each_entry(xattr, &info->xattr_list, list) {
1702		size_t len;
1703
1704		/* skip "trusted." attributes for unprivileged callers */
1705		if (!trusted && xattr_is_trusted(xattr->name))
1706			continue;
1707
1708		len = strlen(xattr->name) + 1;
1709		used += len;
1710		if (buffer) {
1711			if (size < used) {
1712				used = -ERANGE;
1713				break;
1714			}
1715			memcpy(buffer, xattr->name, len);
1716			buffer += len;
1717		}
1718	}
1719	spin_unlock(&info->lock);
1720
1721	return used;
1722}
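
/*
 * Illustrative result: the names are copied back to back, each with its
 * trailing NUL, e.g. "trusted.foo\0security.bar\0" (hypothetical names),
 * and the return value is the total number of bytes used, or -ERANGE if
 * the supplied buffer is too small, as above.
 */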
1723#endif /* CONFIG_TMPFS_XATTR */
1724
1725static const struct inode_operations shmem_symlink_inline_operations = {
1726	.readlink	= generic_readlink,
1727	.follow_link	= shmem_follow_link_inline,
1728#ifdef CONFIG_TMPFS_XATTR
1729	.setxattr	= shmem_setxattr,
1730	.getxattr	= shmem_getxattr,
1731	.listxattr	= shmem_listxattr,
1732	.removexattr	= shmem_removexattr,
1733#endif
1734};
1735
1736static const struct inode_operations shmem_symlink_inode_operations = {
1737	.readlink	= generic_readlink,
1738	.follow_link	= shmem_follow_link,
1739	.put_link	= shmem_put_link,
1740#ifdef CONFIG_TMPFS_XATTR
1741	.setxattr	= shmem_setxattr,
1742	.getxattr	= shmem_getxattr,
1743	.listxattr	= shmem_listxattr,
1744	.removexattr	= shmem_removexattr,
1745#endif
1746};
1747
1748static struct dentry *shmem_get_parent(struct dentry *child)
1749{
1750	return ERR_PTR(-ESTALE);
1751}
1752
1753static int shmem_match(struct inode *ino, void *vfh)
1754{
1755	__u32 *fh = vfh;
1756	__u64 inum = fh[2];
1757	inum = (inum << 32) | fh[1];
1758	return ino->i_ino == inum && fh[0] == ino->i_generation;
1759}
1760
1761static struct dentry *shmem_fh_to_dentry(struct super_block *sb,
1762		struct fid *fid, int fh_len, int fh_type)
1763{
1764	struct inode *inode;
1765	struct dentry *dentry = NULL;
1766	u64 inum = fid->raw[2];
1767	inum = (inum << 32) | fid->raw[1];
1768
1769	if (fh_len < 3)
1770		return NULL;
1771
1772	inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]),
1773			shmem_match, fid->raw);
1774	if (inode) {
1775		dentry = d_find_alias(inode);
1776		iput(inode);
1777	}
1778
1779	return dentry;
1780}
1781
1782static int shmem_encode_fh(struct dentry *dentry, __u32 *fh, int *len,
1783				int connectable)
1784{
1785	struct inode *inode = dentry->d_inode;
1786
1787	if (*len < 3) {
1788		*len = 3;
1789		return 255;
1790	}
1791
1792	if (inode_unhashed(inode)) {
1793		/* Unfortunately insert_inode_hash is not idempotent,
1794		 * so as we hash inodes here rather than at creation
1795		 * time, we need a lock to ensure we only try
1796		 * to do it once.
1797		 */
1798		static DEFINE_SPINLOCK(lock);
1799		spin_lock(&lock);
1800		if (inode_unhashed(inode))
1801			__insert_inode_hash(inode,
1802					    inode->i_ino + inode->i_generation);
1803		spin_unlock(&lock);
1804	}
1805
1806	fh[0] = inode->i_generation;
1807	fh[1] = inode->i_ino;
1808	fh[2] = ((__u64)inode->i_ino) >> 32;
1809
1810	*len = 3;
1811	return 1;
1812}
1813
1814static const struct export_operations shmem_export_ops = {
1815	.get_parent     = shmem_get_parent,
1816	.encode_fh      = shmem_encode_fh,
1817	.fh_to_dentry	= shmem_fh_to_dentry,
1818};
1819
1820static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
1821			       bool remount)
1822{
1823	char *this_char, *value, *rest;
1824
1825	while (options != NULL) {
1826		this_char = options;
1827		for (;;) {
1828			/*
1829			 * NUL-terminate this option: unfortunately,
1830			 * mount options form a comma-separated list,
1831			 * but mpol's nodelist may also contain commas.
1832			 */
1833			options = strchr(options, ',');
1834			if (options == NULL)
1835				break;
1836			options++;
1837			if (!isdigit(*options)) {
1838				options[-1] = '\0';
1839				break;
1840			}
1841		}
1842		if (!*this_char)
1843			continue;
1844		if ((value = strchr(this_char,'=')) != NULL) {
1845			*value++ = 0;
1846		} else {
1847			printk(KERN_ERR
1848			    "tmpfs: No value for mount option '%s'\n",
1849			    this_char);
1850			return 1;
1851		}
1852
1853		if (!strcmp(this_char,"size")) {
1854			unsigned long long size;
1855			size = memparse(value,&rest);
1856			if (*rest == '%') {
1857				size <<= PAGE_SHIFT;
1858				size *= totalram_pages;
1859				do_div(size, 100);
1860				rest++;
1861			}
1862			if (*rest)
1863				goto bad_val;
1864			sbinfo->max_blocks =
1865				DIV_ROUND_UP(size, PAGE_CACHE_SIZE);
1866		} else if (!strcmp(this_char,"nr_blocks")) {
1867			sbinfo->max_blocks = memparse(value, &rest);
1868			if (*rest)
1869				goto bad_val;
1870		} else if (!strcmp(this_char,"nr_inodes")) {
1871			sbinfo->max_inodes = memparse(value, &rest);
1872			if (*rest)
1873				goto bad_val;
1874		} else if (!strcmp(this_char,"mode")) {
1875			if (remount)
1876				continue;
1877			sbinfo->mode = simple_strtoul(value, &rest, 8) & 07777;
1878			if (*rest)
1879				goto bad_val;
1880		} else if (!strcmp(this_char,"uid")) {
1881			if (remount)
1882				continue;
1883			sbinfo->uid = simple_strtoul(value, &rest, 0);
1884			if (*rest)
1885				goto bad_val;
1886		} else if (!strcmp(this_char,"gid")) {
1887			if (remount)
1888				continue;
1889			sbinfo->gid = simple_strtoul(value, &rest, 0);
1890			if (*rest)
1891				goto bad_val;
1892		} else if (!strcmp(this_char,"mpol")) {
1893			if (mpol_parse_str(value, &sbinfo->mpol, 1))
1894				goto bad_val;
1895		} else {
1896			printk(KERN_ERR "tmpfs: Bad mount option %s\n",
1897			       this_char);
1898			return 1;
1899		}
1900	}
1901	return 0;
1902
1903bad_val:
1904	printk(KERN_ERR "tmpfs: Bad value '%s' for mount option '%s'\n",
1905	       value, this_char);
1906	return 1;
1907
1908}
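
/*
 * Example (illustrative) of an option string this parser accepts:
 *
 *	size=50%,nr_inodes=10240,mode=1777,mpol=bind:0,2
 *
 * The isdigit() check above keeps the "0,2" nodelist attached to the
 * mpol option instead of splitting it at the comma, and "size=50%" is
 * converted into a block count as a percentage of totalram_pages.
 */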
1909
1910static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
1911{
1912	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
1913	struct shmem_sb_info config = *sbinfo;
1914	unsigned long inodes;
1915	int error = -EINVAL;
1916
1917	if (shmem_parse_options(data, &config, true))
1918		return error;
1919
1920	spin_lock(&sbinfo->stat_lock);
1921	inodes = sbinfo->max_inodes - sbinfo->free_inodes;
1922	if (percpu_counter_compare(&sbinfo->used_blocks, config.max_blocks) > 0)
1923		goto out;
1924	if (config.max_inodes < inodes)
1925		goto out;
1926	/*
1927	 * Those tests also disallow limited->unlimited while any are in
1928	 * use, so i_blocks will always be zero when max_blocks is zero;
1929	 * but we must separately disallow unlimited->limited, because
1930	 * in that case we have no record of how much is already in use.
1931	 */
1932	if (config.max_blocks && !sbinfo->max_blocks)
1933		goto out;
1934	if (config.max_inodes && !sbinfo->max_inodes)
1935		goto out;
1936
1937	error = 0;
1938	sbinfo->max_blocks  = config.max_blocks;
1939	sbinfo->max_inodes  = config.max_inodes;
1940	sbinfo->free_inodes = config.max_inodes - inodes;
1941
1942	mpol_put(sbinfo->mpol);
1943	sbinfo->mpol        = config.mpol;	/* transfers initial ref */
1944out:
1945	spin_unlock(&sbinfo->stat_lock);
1946	return error;
1947}
1948
1949static int shmem_show_options(struct seq_file *seq, struct vfsmount *vfs)
1950{
1951	struct shmem_sb_info *sbinfo = SHMEM_SB(vfs->mnt_sb);
1952
1953	if (sbinfo->max_blocks != shmem_default_max_blocks())
1954		seq_printf(seq, ",size=%luk",
1955			sbinfo->max_blocks << (PAGE_CACHE_SHIFT - 10));
1956	if (sbinfo->max_inodes != shmem_default_max_inodes())
1957		seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes);
1958	if (sbinfo->mode != (S_IRWXUGO | S_ISVTX))
1959		seq_printf(seq, ",mode=%03o", sbinfo->mode);
1960	if (sbinfo->uid != 0)
1961		seq_printf(seq, ",uid=%u", sbinfo->uid);
1962	if (sbinfo->gid != 0)
1963		seq_printf(seq, ",gid=%u", sbinfo->gid);
1964	shmem_show_mpol(seq, sbinfo->mpol);
1965	return 0;
1966}
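/*
 * Hedged example of the output (values invented): for an instance mounted
 * with non-default limits, uid and gid, the callback above makes
 * /proc/mounts show something like
 *
 *	tmpfs /mnt/tmp tmpfs rw,size=1048576k,nr_inodes=8192,uid=1000,gid=1000 0 0
 *
 * Options left at their defaults (half of RAM, mode 1777, uid/gid 0) are
 * suppressed, so a plain "mount -t tmpfs tmpfs /mnt" lists none of them.
 */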
1967#endif /* CONFIG_TMPFS */
1968
1969static void shmem_put_super(struct super_block *sb)
1970{
1971	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
1972
1973	percpu_counter_destroy(&sbinfo->used_blocks);
1974	kfree(sbinfo);
1975	sb->s_fs_info = NULL;
1976}
1977
1978int shmem_fill_super(struct super_block *sb, void *data, int silent)
1979{
1980	struct inode *inode;
1981	struct dentry *root;
1982	struct shmem_sb_info *sbinfo;
1983	int err = -ENOMEM;
1984
1985	/* Round up to L1_CACHE_BYTES to resist false sharing */
1986	sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info),
1987				L1_CACHE_BYTES), GFP_KERNEL);
1988	if (!sbinfo)
1989		return -ENOMEM;
1990
1991	sbinfo->mode = S_IRWXUGO | S_ISVTX;
1992	sbinfo->uid = current_fsuid();
1993	sbinfo->gid = current_fsgid();
1994	sb->s_fs_info = sbinfo;
1995
1996#ifdef CONFIG_TMPFS
1997	/*
1998	 * By default we only allow half of the physical RAM per
1999	 * tmpfs instance, limiting inodes to one per page of lowmem;
2000	 * but the internal instance is left unlimited.
2001	 */
2002	if (!(sb->s_flags & MS_NOUSER)) {
2003		sbinfo->max_blocks = shmem_default_max_blocks();
2004		sbinfo->max_inodes = shmem_default_max_inodes();
2005		if (shmem_parse_options(data, sbinfo, false)) {
2006			err = -EINVAL;
2007			goto failed;
2008		}
2009	}
2010	sb->s_export_op = &shmem_export_ops;
2011#else
2012	sb->s_flags |= MS_NOUSER;
2013#endif
2014
2015	spin_lock_init(&sbinfo->stat_lock);
2016	if (percpu_counter_init(&sbinfo->used_blocks, 0))
2017		goto failed;
2018	sbinfo->free_inodes = sbinfo->max_inodes;
2019
2020	sb->s_maxbytes = MAX_LFS_FILESIZE;
2021	sb->s_blocksize = PAGE_CACHE_SIZE;
2022	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
2023	sb->s_magic = TMPFS_MAGIC;
2024	sb->s_op = &shmem_ops;
2025	sb->s_time_gran = 1;
2026#ifdef CONFIG_TMPFS_XATTR
2027	sb->s_xattr = shmem_xattr_handlers;
2028#endif
2029#ifdef CONFIG_TMPFS_POSIX_ACL
2030	sb->s_flags |= MS_POSIXACL;
2031#endif
2032
2033	inode = shmem_get_inode(sb, NULL, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE);
2034	if (!inode)
2035		goto failed;
2036	inode->i_uid = sbinfo->uid;
2037	inode->i_gid = sbinfo->gid;
2038	root = d_alloc_root(inode);
2039	if (!root)
2040		goto failed_iput;
2041	sb->s_root = root;
2042	return 0;
2043
2044failed_iput:
2045	iput(inode);
2046failed:
2047	shmem_put_super(sb);
2048	return err;
2049}
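/*
 * Hedged worked example of the defaults applied above (illustrative
 * numbers, assuming 4 KiB pages and no highmem): with 8 GiB of RAM,
 * totalram_pages is 2097152, so a user mount with no options gets
 *
 *	max_blocks = shmem_default_max_blocks() = 1048576	(4 GiB)
 *	max_inodes = shmem_default_max_inodes() = 1048576
 *
 * while the kernel-internal MS_NOUSER mount skips both limits and stays
 * unlimited.
 */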
2050
2051static struct kmem_cache *shmem_inode_cachep;
2052
2053static struct inode *shmem_alloc_inode(struct super_block *sb)
2054{
2055	struct shmem_inode_info *p;
2056	p = kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL);
2057	if (!p)
2058		return NULL;
2059	return &p->vfs_inode;
2060}
2061
2062static void shmem_i_callback(struct rcu_head *head)
2063{
2064	struct inode *inode = container_of(head, struct inode, i_rcu);
2065	INIT_LIST_HEAD(&inode->i_dentry);
2066	kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
2067}
2068
2069static void shmem_destroy_inode(struct inode *inode)
2070{
2071	if ((inode->i_mode & S_IFMT) == S_IFREG) {
2072		/* only struct inode is valid for an inline symlink: a shared mempolicy exists only for regular files */
2073		mpol_free_shared_policy(&SHMEM_I(inode)->policy);
2074	}
2075	call_rcu(&inode->i_rcu, shmem_i_callback);
2076}
2077
2078static void init_once(void *foo)
2079{
2080	struct shmem_inode_info *p = (struct shmem_inode_info *) foo;
2081
2082	inode_init_once(&p->vfs_inode);
2083}
2084
2085static int init_inodecache(void)
2086{
2087	shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
2088				sizeof(struct shmem_inode_info),
2089				0, SLAB_PANIC, init_once);
2090	return 0;
2091}
2092
2093static void destroy_inodecache(void)
2094{
2095	kmem_cache_destroy(shmem_inode_cachep);
2096}
2097
2098static const struct address_space_operations shmem_aops = {
2099	.writepage	= shmem_writepage,
2100	.set_page_dirty	= __set_page_dirty_no_writeback,
2101#ifdef CONFIG_TMPFS
2102	.write_begin	= shmem_write_begin,
2103	.write_end	= shmem_write_end,
2104#endif
2105	.migratepage	= migrate_page,
2106	.error_remove_page = generic_error_remove_page,
2107};
2108
2109static const struct file_operations shmem_file_operations = {
2110	.mmap		= shmem_mmap,
2111#ifdef CONFIG_TMPFS
2112	.llseek		= generic_file_llseek,
2113	.read		= do_sync_read,
2114	.write		= do_sync_write,
2115	.aio_read	= shmem_file_aio_read,
2116	.aio_write	= generic_file_aio_write,
2117	.fsync		= noop_fsync,
2118	.splice_read	= shmem_file_splice_read,
2119	.splice_write	= generic_file_splice_write,
2120#endif
2121};
2122
2123static const struct inode_operations shmem_inode_operations = {
2124	.setattr	= shmem_setattr,
2125	.truncate_range	= shmem_truncate_range,
2126#ifdef CONFIG_TMPFS_XATTR
2127	.setxattr	= shmem_setxattr,
2128	.getxattr	= shmem_getxattr,
2129	.listxattr	= shmem_listxattr,
2130	.removexattr	= shmem_removexattr,
2131#endif
2132};
2133
2134static const struct inode_operations shmem_dir_inode_operations = {
2135#ifdef CONFIG_TMPFS
2136	.create		= shmem_create,
2137	.lookup		= simple_lookup,
2138	.link		= shmem_link,
2139	.unlink		= shmem_unlink,
2140	.symlink	= shmem_symlink,
2141	.mkdir		= shmem_mkdir,
2142	.rmdir		= shmem_rmdir,
2143	.mknod		= shmem_mknod,
2144	.rename		= shmem_rename,
2145#endif
2146#ifdef CONFIG_TMPFS_XATTR
2147	.setxattr	= shmem_setxattr,
2148	.getxattr	= shmem_getxattr,
2149	.listxattr	= shmem_listxattr,
2150	.removexattr	= shmem_removexattr,
2151#endif
2152#ifdef CONFIG_TMPFS_POSIX_ACL
2153	.setattr	= shmem_setattr,
2154#endif
2155};
2156
2157static const struct inode_operations shmem_special_inode_operations = {
2158#ifdef CONFIG_TMPFS_XATTR
2159	.setxattr	= shmem_setxattr,
2160	.getxattr	= shmem_getxattr,
2161	.listxattr	= shmem_listxattr,
2162	.removexattr	= shmem_removexattr,
2163#endif
2164#ifdef CONFIG_TMPFS_POSIX_ACL
2165	.setattr	= shmem_setattr,
2166#endif
2167};
2168
2169static const struct super_operations shmem_ops = {
2170	.alloc_inode	= shmem_alloc_inode,
2171	.destroy_inode	= shmem_destroy_inode,
2172#ifdef CONFIG_TMPFS
2173	.statfs		= shmem_statfs,
2174	.remount_fs	= shmem_remount_fs,
2175	.show_options	= shmem_show_options,
2176#endif
2177	.evict_inode	= shmem_evict_inode,
2178	.drop_inode	= generic_delete_inode,
2179	.put_super	= shmem_put_super,
2180};
2181
2182static const struct vm_operations_struct shmem_vm_ops = {
2183	.fault		= shmem_fault,
2184#ifdef CONFIG_NUMA
2185	.set_policy     = shmem_set_policy,
2186	.get_policy     = shmem_get_policy,
2187#endif
2188};
2189
2190
2191static struct dentry *shmem_mount(struct file_system_type *fs_type,
2192	int flags, const char *dev_name, void *data)
2193{
2194	return mount_nodev(fs_type, flags, data, shmem_fill_super);
2195}
2196
2197static struct file_system_type tmpfs_fs_type = {
2198	.owner		= THIS_MODULE,
2199	.name		= "tmpfs",
2200	.mount		= shmem_mount,
2201	.kill_sb	= kill_litter_super,
2202};
2203
2204int __init init_tmpfs(void)
2205{
2206	int error;
2207
2208	error = bdi_init(&shmem_backing_dev_info);
2209	if (error)
2210		goto out4;
2211
2212	error = init_inodecache();
2213	if (error)
2214		goto out3;
2215
2216	error = register_filesystem(&tmpfs_fs_type);
2217	if (error) {
2218		printk(KERN_ERR "Could not register tmpfs\n");
2219		goto out2;
2220	}
2221
2222	shm_mnt = vfs_kern_mount(&tmpfs_fs_type, MS_NOUSER,
2223				tmpfs_fs_type.name, NULL);
2224	if (IS_ERR(shm_mnt)) {
2225		error = PTR_ERR(shm_mnt);
2226		printk(KERN_ERR "Could not kern_mount tmpfs\n");
2227		goto out1;
2228	}
2229	return 0;
2230
2231out1:
2232	unregister_filesystem(&tmpfs_fs_type);
2233out2:
2234	destroy_inodecache();
2235out3:
2236	bdi_destroy(&shmem_backing_dev_info);
2237out4:
2238	shm_mnt = ERR_PTR(error);
2239	return error;
2240}
2241
2242#ifdef CONFIG_CGROUP_MEM_RES_CTLR
2243/**
2244 * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file
2245 * @inode: the inode to be searched
2246 * @pgoff: the offset to be searched
2247 * @pagep: the pointer for the found page to be stored
2248 * @ent: the pointer for the found swap entry to be stored
2249 *
2250 * If a page is found, its refcount is incremented; the caller is
2251 * responsible for dropping that reference.
2252 */
2253void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff,
2254					struct page **pagep, swp_entry_t *ent)
2255{
2256	swp_entry_t entry = { .val = 0 };
2257	struct page *page = NULL;
2258	struct shmem_inode_info *info = SHMEM_I(inode);
2259
2260	if ((pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
2261		goto out;
2262
2263	spin_lock(&info->lock);
2264#ifdef CONFIG_SWAP
2265	entry = shmem_get_swap(info, pgoff);
2266	if (entry.val)
2267		page = find_get_page(&swapper_space, entry.val);
2268	else
2269#endif
2270		page = find_get_page(inode->i_mapping, pgoff);
2271	spin_unlock(&info->lock);
2272out:
2273	*pagep = page;
2274	*ent = entry;
2275}
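/*
 * Hedged caller sketch (editorial illustration; the surrounding code is
 * invented): because the lookup above takes a page reference via
 * find_get_page(), a caller is expected to drop it again:
 *
 *	struct page *page;
 *	swp_entry_t ent;
 *
 *	mem_cgroup_get_shmem_target(inode, pgoff, &page, &ent);
 *	if (page) {
 *		... inspect the page ...
 *		put_page(page);
 *	} else if (ent.val) {
 *		... handle the swap entry; there is no reference to drop ...
 *	}
 */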
2276#endif
2277
2278#else /* !CONFIG_SHMEM */
2279
2280/*
2281 * tiny-shmem: simple shmemfs and tmpfs using ramfs code
2282 *
2283 * This is intended for small systems where the benefits of the full
2284 * shmem code (swap-backed and resource-limited) are outweighed by
2285 * its complexity. On systems without swap this code should be
2286 * effectively equivalent, but much lighter weight.
2287 */
2288
2289#include <linux/ramfs.h>
2290
2291static struct file_system_type tmpfs_fs_type = {
2292	.name		= "tmpfs",
2293	.mount		= ramfs_mount,
2294	.kill_sb	= kill_litter_super,
2295};
2296
2297int __init init_tmpfs(void)
2298{
2299	BUG_ON(register_filesystem(&tmpfs_fs_type) != 0);
2300
2301	shm_mnt = kern_mount(&tmpfs_fs_type);
2302	BUG_ON(IS_ERR(shm_mnt));
2303
2304	return 0;
2305}
2306
2307int shmem_unuse(swp_entry_t entry, struct page *page)
2308{
2309	return 0;
2310}
2311
2312int shmem_lock(struct file *file, int lock, struct user_struct *user)
2313{
2314	return 0;
2315}
2316
2317void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
2318{
2319	truncate_inode_pages_range(inode->i_mapping, start, end);
2320}
2321EXPORT_SYMBOL_GPL(shmem_truncate_range);
2322
2323#ifdef CONFIG_CGROUP_MEM_RES_CTLR
2324/**
2325 * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file
2326 * @inode: the inode to be searched
2327 * @pgoff: the offset to be searched
2328 * @pagep: the pointer for the found page to be stored
2329 * @ent: the pointer for the found swap entry to be stored
2330 *
2331 * If a page is found, its refcount is incremented; the caller is
2332 * responsible for dropping that reference.
2333 */
2334void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff,
2335					struct page **pagep, swp_entry_t *ent)
2336{
2337	struct page *page = NULL;
2338
2339	if ((pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
2340		goto out;
2341	page = find_get_page(inode->i_mapping, pgoff);
2342out:
2343	*pagep = page;
2344	*ent = (swp_entry_t){ .val = 0 };
2345}
2346#endif
2347
2348#define shmem_vm_ops				generic_file_vm_ops
2349#define shmem_file_operations			ramfs_file_operations
2350#define shmem_get_inode(sb, dir, mode, dev, flags)	ramfs_get_inode(sb, dir, mode, dev)
2351#define shmem_acct_size(flags, size)		0
2352#define shmem_unacct_size(flags, size)		do {} while (0)
2353
2354#endif /* CONFIG_SHMEM */
2355
2356/* common code */
2357
2358/**
2359 * shmem_file_setup - get an unlinked file living in tmpfs
2360 * @name: name for dentry (to be seen in /proc/<pid>/maps)
2361 * @size: size to be set for the file
2362 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
2363 */
2364struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags)
2365{
2366	int error;
2367	struct file *file;
2368	struct inode *inode;
2369	struct path path;
2370	struct dentry *root;
2371	struct qstr this;
2372
2373	if (IS_ERR(shm_mnt))
2374		return (void *)shm_mnt;
2375
2376	if (size < 0 || size > MAX_LFS_FILESIZE)
2377		return ERR_PTR(-EINVAL);
2378
2379	if (shmem_acct_size(flags, size))
2380		return ERR_PTR(-ENOMEM);
2381
2382	error = -ENOMEM;
2383	this.name = name;
2384	this.len = strlen(name);
2385	this.hash = 0; /* will go */
2386	root = shm_mnt->mnt_root;
2387	path.dentry = d_alloc(root, &this);
2388	if (!path.dentry)
2389		goto put_memory;
2390	path.mnt = mntget(shm_mnt);
2391
2392	error = -ENOSPC;
2393	inode = shmem_get_inode(root->d_sb, NULL, S_IFREG | S_IRWXUGO, 0, flags);
2394	if (!inode)
2395		goto put_dentry;
2396
2397	d_instantiate(path.dentry, inode);
2398	inode->i_size = size;
2399	inode->i_nlink = 0;	/* It is unlinked */
2400#ifndef CONFIG_MMU
2401	error = ramfs_nommu_expand_for_mapping(inode, size);
2402	if (error)
2403		goto put_dentry;
2404#endif
2405
2406	error = -ENFILE;
2407	file = alloc_file(&path, FMODE_WRITE | FMODE_READ,
2408		  &shmem_file_operations);
2409	if (!file)
2410		goto put_dentry;
2411
2412	return file;
2413
2414put_dentry:
2415	path_put(&path);
2416put_memory:
2417	shmem_unacct_size(flags, size);
2418	return ERR_PTR(error);
2419}
2420EXPORT_SYMBOL_GPL(shmem_file_setup);
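/*
 * Hedged usage sketch (editorial illustration; the object name and the
 * surrounding code are invented): a kernel caller wanting an unlinked
 * tmpfs file typically does
 *
 *	struct file *file;
 *
 *	file = shmem_file_setup("my-shmem-object", size, VM_NORESERVE);
 *	if (IS_ERR(file))
 *		return PTR_ERR(file);
 *	... use file->f_mapping, or map it into a task ...
 *	fput(file);
 *
 * The final fput() drops the only reference and frees the object.
 * Passing 0 instead of VM_NORESERVE keeps the pre-accounting of the full
 * size, as noted in the kernel-doc above.
 */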
2421
2422/**
2423 * shmem_zero_setup - setup a shared anonymous mapping
2424 * @vma: the vma to be mmapped is prepared by do_mmap_pgoff
2425 */
2426int shmem_zero_setup(struct vm_area_struct *vma)
2427{
2428	struct file *file;
2429	loff_t size = vma->vm_end - vma->vm_start;
2430
2431	file = shmem_file_setup("dev/zero", size, vma->vm_flags);
2432	if (IS_ERR(file))
2433		return PTR_ERR(file);
2434
2435	if (vma->vm_file)
2436		fput(vma->vm_file);
2437	vma->vm_file = file;
2438	vma->vm_ops = &shmem_vm_ops;
2439	vma->vm_flags |= VM_CAN_NONLINEAR;
2440	return 0;
2441}
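/*
 * Illustrative note (hedged): shmem_zero_setup() is what backs shared
 * anonymous memory.  Both a MAP_SHARED mapping of /dev/zero and, in the
 * mmap path, plain
 *
 *	mmap(NULL, len, PROT_READ | PROT_WRITE,
 *	     MAP_SHARED | MAP_ANONYMOUS, -1, 0);
 *
 * end up here, with vma->vm_file replaced by the unlinked "dev/zero"
 * tmpfs file created above.
 */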
2442
2443/**
2444 * shmem_read_mapping_page_gfp - read into page cache, using specified page allocation flags.
2445 * @mapping:	the page's address_space
2446 * @index:	the page index
2447 * @gfp:	the page allocator flags to use if allocating
2448 *
2449 * This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)",
2450 * with any new page allocations done using the specified allocation flags.
2451 * But read_cache_page_gfp() uses the ->readpage() method: which does not
2452 * suit tmpfs, since it may have pages in swapcache, and needs to find those
2453 * for itself; although drivers/gpu/drm i915 and ttm rely upon this support.
2454 *
2455 * i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in
2456 * with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily.
2457 */
2458struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
2459					 pgoff_t index, gfp_t gfp)
2460{
2461#ifdef CONFIG_SHMEM
2462	struct inode *inode = mapping->host;
2463	struct page *page;
2464	int error;
2465
2466	BUG_ON(mapping->a_ops != &shmem_aops);
2467	error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE, gfp, NULL);
2468	if (error)
2469		page = ERR_PTR(error);
2470	else
2471		unlock_page(page);
2472	return page;
2473#else
2474	/*
2475	 * The tiny !SHMEM case uses ramfs without swap
2476	 */
2477	return read_cache_page_gfp(mapping, index, gfp);
2478#endif
2479}
2480EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp);
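/*
 * Hedged usage sketch (editorial illustration, modelled on the i915
 * pattern mentioned in the kernel-doc above; the surrounding code is
 * invented):
 *
 *	gfp_t gfp = mapping_gfp_mask(mapping) | __GFP_NORETRY | __GFP_NOWARN;
 *	struct page *page;
 *
 *	page = shmem_read_mapping_page_gfp(mapping, index, gfp);
 *	if (IS_ERR(page))
 *		return PTR_ERR(page);
 *	... copy from or map the page ...
 *	page_cache_release(page);
 *
 * The page comes back unlocked with an elevated refcount, so the caller
 * must release it when done.
 */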
2481