shmem.c revision 767193253bbac889e176f90b6f17b7015f986551
1/*
2 * Resizable virtual memory filesystem for Linux.
3 *
4 * Copyright (C) 2000 Linus Torvalds.
5 *		 2000 Transmeta Corp.
6 *		 2000-2001 Christoph Rohland
7 *		 2000-2001 SAP AG
8 *		 2002 Red Hat Inc.
9 * Copyright (C) 2002-2005 Hugh Dickins.
10 * Copyright (C) 2002-2005 VERITAS Software Corporation.
11 * Copyright (C) 2004 Andi Kleen, SuSE Labs
12 *
13 * Extended attribute support for tmpfs:
14 * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net>
15 * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
16 *
17 * This file is released under the GPL.
18 */
19
20/*
21 * This virtual memory filesystem is heavily based on the ramfs. It
22 * extends ramfs by the ability to use swap and honor resource limits
23 * which makes it a completely usable filesystem.
24 */
25
26#include <linux/module.h>
27#include <linux/init.h>
28#include <linux/fs.h>
29#include <linux/xattr.h>
30#include <linux/generic_acl.h>
31#include <linux/mm.h>
32#include <linux/mman.h>
33#include <linux/file.h>
34#include <linux/swap.h>
35#include <linux/pagemap.h>
36#include <linux/string.h>
37#include <linux/slab.h>
38#include <linux/backing-dev.h>
39#include <linux/shmem_fs.h>
40#include <linux/mount.h>
41#include <linux/writeback.h>
42#include <linux/vfs.h>
43#include <linux/blkdev.h>
44#include <linux/security.h>
45#include <linux/swapops.h>
46#include <linux/mempolicy.h>
47#include <linux/namei.h>
48#include <linux/ctype.h>
49#include <linux/migrate.h>
50#include <linux/highmem.h>
51#include <linux/backing-dev.h>
52
53#include <asm/uaccess.h>
54#include <asm/div64.h>
55#include <asm/pgtable.h>
56
57/* This magic number is used in glibc for posix shared memory */
58#define TMPFS_MAGIC	0x01021994
59
60#define ENTRIES_PER_PAGE (PAGE_CACHE_SIZE/sizeof(unsigned long))
61#define ENTRIES_PER_PAGEPAGE (ENTRIES_PER_PAGE*ENTRIES_PER_PAGE)
62#define BLOCKS_PER_PAGE  (PAGE_CACHE_SIZE/512)
63
64#define SHMEM_MAX_INDEX  (SHMEM_NR_DIRECT + (ENTRIES_PER_PAGEPAGE/2) * (ENTRIES_PER_PAGE+1))
65#define SHMEM_MAX_BYTES  ((unsigned long long)SHMEM_MAX_INDEX << PAGE_CACHE_SHIFT)
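/*
 * Illustrative arithmetic for the limits above, assuming 4 KiB
 * PAGE_CACHE_SIZE and SHMEM_NR_DIRECT == 16: with a 4-byte unsigned long,
 * ENTRIES_PER_PAGE is 1024, so SHMEM_MAX_INDEX is 16 + 524288 * 1025,
 * about 537 million pages, and SHMEM_MAX_BYTES is roughly 2 TiB; with an
 * 8-byte unsigned long the same arithmetic gives about 67 million pages,
 * roughly 256 GiB per file.
 */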
66
67#define VM_ACCT(size)    (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT)
68
69/* info->flags needs VM_flags to handle pagein/truncate races efficiently */
70#define SHMEM_PAGEIN	 VM_READ
71#define SHMEM_TRUNCATE	 VM_WRITE
72
73/* Definition to limit shmem_truncate's steps between cond_rescheds */
74#define LATENCY_LIMIT	 64
75
76/* Pretend that each entry is of this size in directory's i_size */
77#define BOGO_DIRENT_SIZE 20
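/* e.g. a freshly created directory reports i_size of 2 * BOGO_DIRENT_SIZE */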
78
79/* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */
80enum sgp_type {
81	SGP_QUICK,	/* don't try more than file page cache lookup */
82	SGP_READ,	/* don't exceed i_size, don't allocate page */
83	SGP_CACHE,	/* don't exceed i_size, may allocate page */
84	SGP_WRITE,	/* may exceed i_size, may allocate page */
85};
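/*
 * How these are used in this file: shmem_nopage faults pages in with
 * SGP_CACHE, and shmem_populate uses SGP_QUICK for its nonblocking case;
 * do_shmem_file_read, shmem_follow_link and shmem_notify_change use
 * SGP_READ, which may hand back no page at all for a hole; the write
 * paths (shmem_prepare_write, shmem_file_write, shmem_symlink) use
 * SGP_WRITE so that the file may grow beyond i_size.
 */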
86
87static int shmem_getpage(struct inode *inode, unsigned long idx,
88			 struct page **pagep, enum sgp_type sgp, int *type);
89
90static inline struct page *shmem_dir_alloc(gfp_t gfp_mask)
91{
92	/*
93	 * The above definition of ENTRIES_PER_PAGE, and the use of
94	 * BLOCKS_PER_PAGE on indirect pages, assume PAGE_CACHE_SIZE:
95	 * might be reconsidered if it ever diverges from PAGE_SIZE.
96	 */
97	return alloc_pages(gfp_mask, PAGE_CACHE_SHIFT-PAGE_SHIFT);
98}
99
100static inline void shmem_dir_free(struct page *page)
101{
102	__free_pages(page, PAGE_CACHE_SHIFT-PAGE_SHIFT);
103}
104
105static struct page **shmem_dir_map(struct page *page)
106{
107	return (struct page **)kmap_atomic(page, KM_USER0);
108}
109
110static inline void shmem_dir_unmap(struct page **dir)
111{
112	kunmap_atomic(dir, KM_USER0);
113}
114
115static swp_entry_t *shmem_swp_map(struct page *page)
116{
117	return (swp_entry_t *)kmap_atomic(page, KM_USER1);
118}
119
120static inline void shmem_swp_balance_unmap(void)
121{
122	/*
123	 * When passing a pointer to an i_direct entry, to code which
124	 * also handles indirect entries and so will shmem_swp_unmap,
125	 * we must arrange for the preempt count to remain in balance.
126	 * What kmap_atomic of a lowmem page does depends on config
127	 * and architecture, so pretend to kmap_atomic some lowmem page.
128	 */
129	(void) kmap_atomic(ZERO_PAGE(0), KM_USER1);
130}
131
132static inline void shmem_swp_unmap(swp_entry_t *entry)
133{
134	kunmap_atomic(entry, KM_USER1);
135}
136
137static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
138{
139	return sb->s_fs_info;
140}
141
142/*
143 * shmem_file_setup pre-accounts the whole fixed size of a VM object,
144 * for shared memory and for shared anonymous (/dev/zero) mappings
145 * (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1),
146 * consistent with the pre-accounting of private mappings ...
147 */
148static inline int shmem_acct_size(unsigned long flags, loff_t size)
149{
150	return (flags & VM_ACCOUNT)?
151		security_vm_enough_memory(VM_ACCT(size)): 0;
152}
153
154static inline void shmem_unacct_size(unsigned long flags, loff_t size)
155{
156	if (flags & VM_ACCOUNT)
157		vm_unacct_memory(VM_ACCT(size));
158}
159
160/*
161 * ... whereas tmpfs objects are accounted incrementally as
162 * pages are allocated, in order to allow huge sparse files.
163 * shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM,
164 * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
165 */
166static inline int shmem_acct_block(unsigned long flags)
167{
168	return (flags & VM_ACCOUNT)?
169		0: security_vm_enough_memory(VM_ACCT(PAGE_CACHE_SIZE));
170}
171
172static inline void shmem_unacct_blocks(unsigned long flags, long pages)
173{
174	if (!(flags & VM_ACCOUNT))
175		vm_unacct_memory(pages * VM_ACCT(PAGE_CACHE_SIZE));
176}
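/*
 * Illustration of the split above, assuming 4 KiB pages: a 1 MiB shared
 * memory or /dev/zero mapping (VM_ACCOUNT set) charges all 256 pages to
 * the overcommit accounting up front through shmem_acct_size(); a 1 MiB
 * tmpfs file instead charges VM_ACCT(PAGE_CACHE_SIZE), i.e. one page, in
 * shmem_acct_block() each time a page is actually allocated, so a sparse
 * file costs only what it instantiates.
 */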
177
178static struct super_operations shmem_ops;
179static const struct address_space_operations shmem_aops;
180static const struct file_operations shmem_file_operations;
181static struct inode_operations shmem_inode_operations;
182static struct inode_operations shmem_dir_inode_operations;
183static struct inode_operations shmem_special_inode_operations;
184static struct vm_operations_struct shmem_vm_ops;
185
186static struct backing_dev_info shmem_backing_dev_info  __read_mostly = {
187	.ra_pages	= 0,	/* No readahead */
188	.capabilities	= BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
189	.unplug_io_fn	= default_unplug_io_fn,
190};
191
192static LIST_HEAD(shmem_swaplist);
193static DEFINE_SPINLOCK(shmem_swaplist_lock);
194
195static void shmem_free_blocks(struct inode *inode, long pages)
196{
197	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
198	if (sbinfo->max_blocks) {
199		spin_lock(&sbinfo->stat_lock);
200		sbinfo->free_blocks += pages;
201		inode->i_blocks -= pages*BLOCKS_PER_PAGE;
202		spin_unlock(&sbinfo->stat_lock);
203	}
204}
205
206/*
207 * shmem_recalc_inode - recalculate the size of an inode
208 *
209 * @inode: inode to recalc
210 *
211 * We have to calculate the free blocks since the mm can drop
212 * undirtied hole pages behind our back.
213 *
214 * But normally info->alloced == inode->i_mapping->nrpages + info->swapped,
215 * so the mm has freed info->alloced - (nrpages + swapped) pages.
216 *
217 * It has to be called with the spinlock held.
218 */

219static void shmem_recalc_inode(struct inode *inode)
220{
221	struct shmem_inode_info *info = SHMEM_I(inode);
222	long freed;
223
224	freed = info->alloced - info->swapped - inode->i_mapping->nrpages;
225	if (freed > 0) {
226		info->alloced -= freed;
227		shmem_unacct_blocks(info->flags, freed);
228		shmem_free_blocks(inode, freed);
229	}
230}
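/*
 * For example: if info->alloced is 8, info->swapped is 2 and the mapping
 * now holds only 4 pages, the mm has reclaimed 2 clean hole pages behind
 * our back, so the calculation above returns those 2 pages to the
 * block quota via shmem_unacct_blocks and shmem_free_blocks.
 */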
231
232/*
233 * shmem_swp_entry - find the swap vector position in the info structure
234 *
235 * @info:  info structure for the inode
236 * @index: index of the page to find
237 * @page:  optional page to add to the structure. Has to be preset to
238 *         all zeros
239 *
240 * If the needed index block is not allocated yet, it returns NULL when
241 * *page is NULL; otherwise it uses *page for the needed block and sets
242 * *page to NULL on return, to indicate that the page has been consumed.
243 *
244 * The swap vector is organized the following way:
245 *
246 * There are SHMEM_NR_DIRECT entries directly stored in the
247 * shmem_inode_info structure. So small files do not need an additional
248 * allocation.
249 *
250 * For pages with index > SHMEM_NR_DIRECT there is the pointer
251 * i_indirect which points to a page which holds in the first half
252 * doubly indirect blocks, in the second half triple indirect blocks:
253 *
254 * For an artificial ENTRIES_PER_PAGE = 4 this would lead to the
255 * following layout (for SHMEM_NR_DIRECT == 16):
256 *
257 * i_indirect -> dir --> 16-19
258 * 	      |	     +-> 20-23
259 * 	      |
260 * 	      +-->dir2 --> 24-27
261 * 	      |	       +-> 28-31
262 * 	      |	       +-> 32-35
263 * 	      |	       +-> 36-39
264 * 	      |
265 * 	      +-->dir3 --> 40-43
266 * 	       	       +-> 44-47
267 * 	      	       +-> 48-51
268 * 	      	       +-> 52-55
269 */
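/*
 * Worked example (illustrative, with SHMEM_NR_DIRECT == 16 and a 32-bit
 * ENTRIES_PER_PAGE of 1024): for page index 100000, shmem_swp_entry
 * subtracts the 16 direct slots to get 99984, which splits into subdir
 * slot 97 (99984 / 1024) and offset 656 (99984 % 1024); since 97 < 512
 * it lies in the doubly indirect half, so the entry is found at
 * shmem_swp_map(dir[97]) + 656, where dir is the mapped i_indirect page.
 */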
270static swp_entry_t *shmem_swp_entry(struct shmem_inode_info *info, unsigned long index, struct page **page)
271{
272	unsigned long offset;
273	struct page **dir;
274	struct page *subdir;
275
276	if (index < SHMEM_NR_DIRECT) {
277		shmem_swp_balance_unmap();
278		return info->i_direct+index;
279	}
280	if (!info->i_indirect) {
281		if (page) {
282			info->i_indirect = *page;
283			*page = NULL;
284		}
285		return NULL;			/* need another page */
286	}
287
288	index -= SHMEM_NR_DIRECT;
289	offset = index % ENTRIES_PER_PAGE;
290	index /= ENTRIES_PER_PAGE;
291	dir = shmem_dir_map(info->i_indirect);
292
293	if (index >= ENTRIES_PER_PAGE/2) {
294		index -= ENTRIES_PER_PAGE/2;
295		dir += ENTRIES_PER_PAGE/2 + index/ENTRIES_PER_PAGE;
296		index %= ENTRIES_PER_PAGE;
297		subdir = *dir;
298		if (!subdir) {
299			if (page) {
300				*dir = *page;
301				*page = NULL;
302			}
303			shmem_dir_unmap(dir);
304			return NULL;		/* need another page */
305		}
306		shmem_dir_unmap(dir);
307		dir = shmem_dir_map(subdir);
308	}
309
310	dir += index;
311	subdir = *dir;
312	if (!subdir) {
313		if (!page || !(subdir = *page)) {
314			shmem_dir_unmap(dir);
315			return NULL;		/* need a page */
316		}
317		*dir = subdir;
318		*page = NULL;
319	}
320	shmem_dir_unmap(dir);
321	return shmem_swp_map(subdir) + offset;
322}
323
324static void shmem_swp_set(struct shmem_inode_info *info, swp_entry_t *entry, unsigned long value)
325{
326	long incdec = value? 1: -1;
327
328	entry->val = value;
329	info->swapped += incdec;
330	if ((unsigned long)(entry - info->i_direct) >= SHMEM_NR_DIRECT) {
331		struct page *page = kmap_atomic_to_page(entry);
332		set_page_private(page, page_private(page) + incdec);
333	}
334}
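/*
 * Note that shmem_swp_set also keeps a running count of live swap entries
 * in page_private() of each lowest-level index page; shmem_truncate_range
 * and shmem_unuse_inode use that count to skip subdirectories holding no
 * swap, and truncation frees a subdirectory page once its count reaches
 * zero.
 */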
335
336/*
337 * shmem_swp_alloc - get the position of the swap entry for the page.
338 *                   If it does not exist allocate the entry.
339 *
340 * @info:	info structure for the inode
341 * @index:	index of the page to find
342 * @sgp:	check and recheck i_size? skip allocation?
343 */
344static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long index, enum sgp_type sgp)
345{
346	struct inode *inode = &info->vfs_inode;
347	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
348	struct page *page = NULL;
349	swp_entry_t *entry;
350
351	if (sgp != SGP_WRITE &&
352	    ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode))
353		return ERR_PTR(-EINVAL);
354
355	while (!(entry = shmem_swp_entry(info, index, &page))) {
356		if (sgp == SGP_READ)
357			return shmem_swp_map(ZERO_PAGE(0));
358		/*
359		 * Test free_blocks against 1 not 0, since we have 1 data
360		 * page (and perhaps indirect index pages) yet to allocate:
361		 * a waste to allocate index if we cannot allocate data.
362		 */
363		if (sbinfo->max_blocks) {
364			spin_lock(&sbinfo->stat_lock);
365			if (sbinfo->free_blocks <= 1) {
366				spin_unlock(&sbinfo->stat_lock);
367				return ERR_PTR(-ENOSPC);
368			}
369			sbinfo->free_blocks--;
370			inode->i_blocks += BLOCKS_PER_PAGE;
371			spin_unlock(&sbinfo->stat_lock);
372		}
373
374		spin_unlock(&info->lock);
375		page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping) | __GFP_ZERO);
376		if (page)
377			set_page_private(page, 0);
378		spin_lock(&info->lock);
379
380		if (!page) {
381			shmem_free_blocks(inode, 1);
382			return ERR_PTR(-ENOMEM);
383		}
384		if (sgp != SGP_WRITE &&
385		    ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
386			entry = ERR_PTR(-EINVAL);
387			break;
388		}
389		if (info->next_index <= index)
390			info->next_index = index + 1;
391	}
392	if (page) {
393		/* another task gave its page, or truncated the file */
394		shmem_free_blocks(inode, 1);
395		shmem_dir_free(page);
396	}
397	if (info->next_index <= index && !IS_ERR(entry))
398		info->next_index = index + 1;
399	return entry;
400}
401
402/*
403 * shmem_free_swp - free some swap entries in a directory
404 *
405 * @dir:   pointer to the directory
406 * @edir:  pointer after last entry of the directory
407 */
408static int shmem_free_swp(swp_entry_t *dir, swp_entry_t *edir)
409{
410	swp_entry_t *ptr;
411	int freed = 0;
412
413	for (ptr = dir; ptr < edir; ptr++) {
414		if (ptr->val) {
415			free_swap_and_cache(*ptr);
416			*ptr = (swp_entry_t){0};
417			freed++;
418		}
419	}
420	return freed;
421}
422
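/*
 * shmem_map_and_free_swp - free swap entries from one indirect page,
 * working in batches of LATENCY_LIMIT entries so the atomic kmaps can be
 * dropped around cond_resched() whenever a reschedule is pending.
 * *dir is unmapped and cleared if we had to reschedule, so the caller
 * knows to remap it.
 */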
423static int shmem_map_and_free_swp(struct page *subdir,
424		int offset, int limit, struct page ***dir)
425{
426	swp_entry_t *ptr;
427	int freed = 0;
428
429	ptr = shmem_swp_map(subdir);
430	for (; offset < limit; offset += LATENCY_LIMIT) {
431		int size = limit - offset;
432		if (size > LATENCY_LIMIT)
433			size = LATENCY_LIMIT;
434		freed += shmem_free_swp(ptr+offset, ptr+offset+size);
435		if (need_resched()) {
436			shmem_swp_unmap(ptr);
437			if (*dir) {
438				shmem_dir_unmap(*dir);
439				*dir = NULL;
440			}
441			cond_resched();
442			ptr = shmem_swp_map(subdir);
443		}
444	}
445	shmem_swp_unmap(ptr);
446	return freed;
447}
448
449static void shmem_free_pages(struct list_head *next)
450{
451	struct page *page;
452	int freed = 0;
453
454	do {
455		page = container_of(next, struct page, lru);
456		next = next->next;
457		shmem_dir_free(page);
458		freed++;
459		if (freed >= LATENCY_LIMIT) {
460			cond_resched();
461			freed = 0;
462		}
463	} while (next);
464}
465
466static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
467{
468	struct shmem_inode_info *info = SHMEM_I(inode);
469	unsigned long idx;
470	unsigned long size;
471	unsigned long limit;
472	unsigned long stage;
473	unsigned long diroff;
474	struct page **dir;
475	struct page *topdir;
476	struct page *middir;
477	struct page *subdir;
478	swp_entry_t *ptr;
479	LIST_HEAD(pages_to_free);
480	long nr_pages_to_free = 0;
481	long nr_swaps_freed = 0;
482	int offset;
483	int freed;
484	int punch_hole = 0;
485
486	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
487	idx = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
488	if (idx >= info->next_index)
489		return;
490
491	spin_lock(&info->lock);
492	info->flags |= SHMEM_TRUNCATE;
493	if (likely(end == (loff_t) -1)) {
494		limit = info->next_index;
495		info->next_index = idx;
496	} else {
497		limit = (end + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
498		if (limit > info->next_index)
499			limit = info->next_index;
500		punch_hole = 1;
501	}
502
503	topdir = info->i_indirect;
504	if (topdir && idx <= SHMEM_NR_DIRECT && !punch_hole) {
505		info->i_indirect = NULL;
506		nr_pages_to_free++;
507		list_add(&topdir->lru, &pages_to_free);
508	}
509	spin_unlock(&info->lock);
510
511	if (info->swapped && idx < SHMEM_NR_DIRECT) {
512		ptr = info->i_direct;
513		size = limit;
514		if (size > SHMEM_NR_DIRECT)
515			size = SHMEM_NR_DIRECT;
516		nr_swaps_freed = shmem_free_swp(ptr+idx, ptr+size);
517	}
518
519	/*
520	 * If there are no indirect blocks or we are punching a hole
521	 * below indirect blocks, nothing to be done.
522	 */
523	if (!topdir || (punch_hole && (limit <= SHMEM_NR_DIRECT)))
524		goto done2;
525
526	BUG_ON(limit <= SHMEM_NR_DIRECT);
527	limit -= SHMEM_NR_DIRECT;
528	idx = (idx > SHMEM_NR_DIRECT)? (idx - SHMEM_NR_DIRECT): 0;
529	offset = idx % ENTRIES_PER_PAGE;
530	idx -= offset;
531
532	dir = shmem_dir_map(topdir);
533	stage = ENTRIES_PER_PAGEPAGE/2;
534	if (idx < ENTRIES_PER_PAGEPAGE/2) {
535		middir = topdir;
536		diroff = idx/ENTRIES_PER_PAGE;
537	} else {
538		dir += ENTRIES_PER_PAGE/2;
539		dir += (idx - ENTRIES_PER_PAGEPAGE/2)/ENTRIES_PER_PAGEPAGE;
540		while (stage <= idx)
541			stage += ENTRIES_PER_PAGEPAGE;
542		middir = *dir;
543		if (*dir) {
544			diroff = ((idx - ENTRIES_PER_PAGEPAGE/2) %
545				ENTRIES_PER_PAGEPAGE) / ENTRIES_PER_PAGE;
546			if (!diroff && !offset) {
547				*dir = NULL;
548				nr_pages_to_free++;
549				list_add(&middir->lru, &pages_to_free);
550			}
551			shmem_dir_unmap(dir);
552			dir = shmem_dir_map(middir);
553		} else {
554			diroff = 0;
555			offset = 0;
556			idx = stage;
557		}
558	}
559
560	for (; idx < limit; idx += ENTRIES_PER_PAGE, diroff++) {
561		if (unlikely(idx == stage)) {
562			shmem_dir_unmap(dir);
563			dir = shmem_dir_map(topdir) +
564			    ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE;
565			while (!*dir) {
566				dir++;
567				idx += ENTRIES_PER_PAGEPAGE;
568				if (idx >= limit)
569					goto done1;
570			}
571			stage = idx + ENTRIES_PER_PAGEPAGE;
572			middir = *dir;
573			*dir = NULL;
574			nr_pages_to_free++;
575			list_add(&middir->lru, &pages_to_free);
576			shmem_dir_unmap(dir);
577			cond_resched();
578			dir = shmem_dir_map(middir);
579			diroff = 0;
580		}
581		subdir = dir[diroff];
582		if (subdir && page_private(subdir)) {
583			size = limit - idx;
584			if (size > ENTRIES_PER_PAGE)
585				size = ENTRIES_PER_PAGE;
586			freed = shmem_map_and_free_swp(subdir,
587						offset, size, &dir);
588			if (!dir)
589				dir = shmem_dir_map(middir);
590			nr_swaps_freed += freed;
591			if (offset)
592				spin_lock(&info->lock);
593			set_page_private(subdir, page_private(subdir) - freed);
594			if (offset)
595				spin_unlock(&info->lock);
596			if (!punch_hole)
597				BUG_ON(page_private(subdir) > offset);
598		}
599		if (offset)
600			offset = 0;
601		else if (subdir && !page_private(subdir)) {
602			dir[diroff] = NULL;
603			nr_pages_to_free++;
604			list_add(&subdir->lru, &pages_to_free);
605		}
606	}
607done1:
608	shmem_dir_unmap(dir);
609done2:
610	if (inode->i_mapping->nrpages && (info->flags & SHMEM_PAGEIN)) {
611		/*
612		 * Call truncate_inode_pages again: racing shmem_unuse_inode
613		 * may have swizzled a page in from swap since vmtruncate or
614		 * generic_delete_inode did it, before we lowered next_index.
615		 * Also, though shmem_getpage checks i_size before adding to
616		 * cache, no recheck after: so fix the narrow window there too.
617		 */
618		truncate_inode_pages_range(inode->i_mapping, start, end);
619	}
620
621	spin_lock(&info->lock);
622	info->flags &= ~SHMEM_TRUNCATE;
623	info->swapped -= nr_swaps_freed;
624	if (nr_pages_to_free)
625		shmem_free_blocks(inode, nr_pages_to_free);
626	shmem_recalc_inode(inode);
627	spin_unlock(&info->lock);
628
629	/*
630	 * Empty swap vector directory pages to be freed?
631	 */
632	if (!list_empty(&pages_to_free)) {
633		pages_to_free.prev->next = NULL;
634		shmem_free_pages(pages_to_free.next);
635	}
636}
637
638static void shmem_truncate(struct inode *inode)
639{
640	shmem_truncate_range(inode, inode->i_size, (loff_t)-1);
641}
642
643static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
644{
645	struct inode *inode = dentry->d_inode;
646	struct page *page = NULL;
647	int error;
648
649	if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
650		if (attr->ia_size < inode->i_size) {
651			/*
652			 * If truncating down to a partial page, then
653			 * if that page is already allocated, hold it
654			 * in memory until the truncation is over, so
655			 * truncate_partial_page cannot miss it were
656			 * it assigned to swap.
657			 */
658			if (attr->ia_size & (PAGE_CACHE_SIZE-1)) {
659				(void) shmem_getpage(inode,
660					attr->ia_size>>PAGE_CACHE_SHIFT,
661						&page, SGP_READ, NULL);
662			}
663			/*
664			 * Reset SHMEM_PAGEIN flag so that shmem_truncate can
665			 * detect if any pages might have been added to cache
666			 * after truncate_inode_pages.  But we needn't bother
667			 * if it's being fully truncated to zero-length: the
668			 * nrpages check is efficient enough in that case.
669			 */
670			if (attr->ia_size) {
671				struct shmem_inode_info *info = SHMEM_I(inode);
672				spin_lock(&info->lock);
673				info->flags &= ~SHMEM_PAGEIN;
674				spin_unlock(&info->lock);
675			}
676		}
677	}
678
679	error = inode_change_ok(inode, attr);
680	if (!error)
681		error = inode_setattr(inode, attr);
682#ifdef CONFIG_TMPFS_POSIX_ACL
683	if (!error && (attr->ia_valid & ATTR_MODE))
684		error = generic_acl_chmod(inode, &shmem_acl_ops);
685#endif
686	if (page)
687		page_cache_release(page);
688	return error;
689}
690
691static void shmem_delete_inode(struct inode *inode)
692{
693	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
694	struct shmem_inode_info *info = SHMEM_I(inode);
695
696	if (inode->i_op->truncate == shmem_truncate) {
697		truncate_inode_pages(inode->i_mapping, 0);
698		shmem_unacct_size(info->flags, inode->i_size);
699		inode->i_size = 0;
700		shmem_truncate(inode);
701		if (!list_empty(&info->swaplist)) {
702			spin_lock(&shmem_swaplist_lock);
703			list_del_init(&info->swaplist);
704			spin_unlock(&shmem_swaplist_lock);
705		}
706	}
707	BUG_ON(inode->i_blocks);
708	if (sbinfo->max_inodes) {
709		spin_lock(&sbinfo->stat_lock);
710		sbinfo->free_inodes++;
711		spin_unlock(&sbinfo->stat_lock);
712	}
713	clear_inode(inode);
714}
715
716static inline int shmem_find_swp(swp_entry_t entry, swp_entry_t *dir, swp_entry_t *edir)
717{
718	swp_entry_t *ptr;
719
720	for (ptr = dir; ptr < edir; ptr++) {
721		if (ptr->val == entry.val)
722			return ptr - dir;
723	}
724	return -1;
725}
726
727static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, struct page *page)
728{
729	struct inode *inode;
730	unsigned long idx;
731	unsigned long size;
732	unsigned long limit;
733	unsigned long stage;
734	struct page **dir;
735	struct page *subdir;
736	swp_entry_t *ptr;
737	int offset;
738
739	idx = 0;
740	ptr = info->i_direct;
741	spin_lock(&info->lock);
742	limit = info->next_index;
743	size = limit;
744	if (size > SHMEM_NR_DIRECT)
745		size = SHMEM_NR_DIRECT;
746	offset = shmem_find_swp(entry, ptr, ptr+size);
747	if (offset >= 0) {
748		shmem_swp_balance_unmap();
749		goto found;
750	}
751	if (!info->i_indirect)
752		goto lost2;
753
754	dir = shmem_dir_map(info->i_indirect);
755	stage = SHMEM_NR_DIRECT + ENTRIES_PER_PAGEPAGE/2;
756
757	for (idx = SHMEM_NR_DIRECT; idx < limit; idx += ENTRIES_PER_PAGE, dir++) {
758		if (unlikely(idx == stage)) {
759			shmem_dir_unmap(dir-1);
760			dir = shmem_dir_map(info->i_indirect) +
761			    ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE;
762			while (!*dir) {
763				dir++;
764				idx += ENTRIES_PER_PAGEPAGE;
765				if (idx >= limit)
766					goto lost1;
767			}
768			stage = idx + ENTRIES_PER_PAGEPAGE;
769			subdir = *dir;
770			shmem_dir_unmap(dir);
771			dir = shmem_dir_map(subdir);
772		}
773		subdir = *dir;
774		if (subdir && page_private(subdir)) {
775			ptr = shmem_swp_map(subdir);
776			size = limit - idx;
777			if (size > ENTRIES_PER_PAGE)
778				size = ENTRIES_PER_PAGE;
779			offset = shmem_find_swp(entry, ptr, ptr+size);
780			if (offset >= 0) {
781				shmem_dir_unmap(dir);
782				goto found;
783			}
784			shmem_swp_unmap(ptr);
785		}
786	}
787lost1:
788	shmem_dir_unmap(dir-1);
789lost2:
790	spin_unlock(&info->lock);
791	return 0;
792found:
793	idx += offset;
794	inode = &info->vfs_inode;
795	if (move_from_swap_cache(page, idx, inode->i_mapping) == 0) {
796		info->flags |= SHMEM_PAGEIN;
797		shmem_swp_set(info, ptr + offset, 0);
798	}
799	shmem_swp_unmap(ptr);
800	spin_unlock(&info->lock);
801	/*
802	 * Decrement swap count even when the entry is left behind:
803	 * try_to_unuse will skip over mms, then reincrement count.
804	 */
805	swap_free(entry);
806	return 1;
807}
808
809/*
810 * shmem_unuse() searches the swaplist for the shmem inode holding this swap entry.
811 */
812int shmem_unuse(swp_entry_t entry, struct page *page)
813{
814	struct list_head *p, *next;
815	struct shmem_inode_info *info;
816	int found = 0;
817
818	spin_lock(&shmem_swaplist_lock);
819	list_for_each_safe(p, next, &shmem_swaplist) {
820		info = list_entry(p, struct shmem_inode_info, swaplist);
821		if (!info->swapped)
822			list_del_init(&info->swaplist);
823		else if (shmem_unuse_inode(info, entry, page)) {
824			/* move head to start search for next from here */
825			list_move_tail(&shmem_swaplist, &info->swaplist);
826			found = 1;
827			break;
828		}
829	}
830	spin_unlock(&shmem_swaplist_lock);
831	return found;
832}
833
834/*
835 * Move the page from the page cache to the swap cache.
836 */
837static int shmem_writepage(struct page *page, struct writeback_control *wbc)
838{
839	struct shmem_inode_info *info;
840	swp_entry_t *entry, swap;
841	struct address_space *mapping;
842	unsigned long index;
843	struct inode *inode;
844
845	BUG_ON(!PageLocked(page));
846	BUG_ON(page_mapped(page));
847
848	mapping = page->mapping;
849	index = page->index;
850	inode = mapping->host;
851	info = SHMEM_I(inode);
852	if (info->flags & VM_LOCKED)
853		goto redirty;
854	swap = get_swap_page();
855	if (!swap.val)
856		goto redirty;
857
858	spin_lock(&info->lock);
859	shmem_recalc_inode(inode);
860	if (index >= info->next_index) {
861		BUG_ON(!(info->flags & SHMEM_TRUNCATE));
862		goto unlock;
863	}
864	entry = shmem_swp_entry(info, index, NULL);
865	BUG_ON(!entry);
866	BUG_ON(entry->val);
867
868	if (move_to_swap_cache(page, swap) == 0) {
869		shmem_swp_set(info, entry, swap.val);
870		shmem_swp_unmap(entry);
871		spin_unlock(&info->lock);
872		if (list_empty(&info->swaplist)) {
873			spin_lock(&shmem_swaplist_lock);
874			/* move instead of add in case we're racing */
875			list_move_tail(&info->swaplist, &shmem_swaplist);
876			spin_unlock(&shmem_swaplist_lock);
877		}
878		unlock_page(page);
879		return 0;
880	}
881
882	shmem_swp_unmap(entry);
883unlock:
884	spin_unlock(&info->lock);
885	swap_free(swap);
886redirty:
887	set_page_dirty(page);
888	return AOP_WRITEPAGE_ACTIVATE;	/* Return with the page locked */
889}
890
891#ifdef CONFIG_NUMA
892static inline int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes)
893{
894	char *nodelist = strchr(value, ':');
895	int err = 1;
896
897	if (nodelist) {
898		/* NUL-terminate policy string */
899		*nodelist++ = '\0';
900		if (nodelist_parse(nodelist, *policy_nodes))
901			goto out;
902	}
903	if (!strcmp(value, "default")) {
904		*policy = MPOL_DEFAULT;
905		/* Don't allow a nodelist */
906		if (!nodelist)
907			err = 0;
908	} else if (!strcmp(value, "prefer")) {
909		*policy = MPOL_PREFERRED;
910		/* Insist on a nodelist of one node only */
911		if (nodelist) {
912			char *rest = nodelist;
913			while (isdigit(*rest))
914				rest++;
915			if (!*rest)
916				err = 0;
917		}
918	} else if (!strcmp(value, "bind")) {
919		*policy = MPOL_BIND;
920		/* Insist on a nodelist */
921		if (nodelist)
922			err = 0;
923	} else if (!strcmp(value, "interleave")) {
924		*policy = MPOL_INTERLEAVE;
925		/* Default to nodes online if no nodelist */
926		if (!nodelist)
927			*policy_nodes = node_online_map;
928		err = 0;
929	}
930out:
931	/* Restore string for error message */
932	if (nodelist)
933		*--nodelist = ':';
934	return err;
935}
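/*
 * For example (illustrative): a value string of "interleave" selects
 * MPOL_INTERLEAVE over all online nodes, "bind:0-3" selects MPOL_BIND
 * with the parsed nodelist, and "prefer:1" selects MPOL_PREFERRED on
 * node 1; anything unrecognized leaves err nonzero.
 */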
936
937static struct page *shmem_swapin_async(struct shared_policy *p,
938				       swp_entry_t entry, unsigned long idx)
939{
940	struct page *page;
941	struct vm_area_struct pvma;
942
943	/* Create a pseudo vma that just contains the policy */
944	memset(&pvma, 0, sizeof(struct vm_area_struct));
945	pvma.vm_end = PAGE_SIZE;
946	pvma.vm_pgoff = idx;
947	pvma.vm_policy = mpol_shared_policy_lookup(p, idx);
948	page = read_swap_cache_async(entry, &pvma, 0);
949	mpol_free(pvma.vm_policy);
950	return page;
951}
952
953struct page *shmem_swapin(struct shmem_inode_info *info, swp_entry_t entry,
954			  unsigned long idx)
955{
956	struct shared_policy *p = &info->policy;
957	int i, num;
958	struct page *page;
959	unsigned long offset;
960
961	num = valid_swaphandles(entry, &offset);
962	for (i = 0; i < num; offset++, i++) {
963		page = shmem_swapin_async(p,
964				swp_entry(swp_type(entry), offset), idx);
965		if (!page)
966			break;
967		page_cache_release(page);
968	}
969	lru_add_drain();	/* Push any new pages onto the LRU now */
970	return shmem_swapin_async(p, entry, idx);
971}
972
973static struct page *
974shmem_alloc_page(gfp_t gfp, struct shmem_inode_info *info,
975		 unsigned long idx)
976{
977	struct vm_area_struct pvma;
978	struct page *page;
979
980	memset(&pvma, 0, sizeof(struct vm_area_struct));
981	pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx);
982	pvma.vm_pgoff = idx;
983	pvma.vm_end = PAGE_SIZE;
984	page = alloc_page_vma(gfp | __GFP_ZERO, &pvma, 0);
985	mpol_free(pvma.vm_policy);
986	return page;
987}
988#else
989static inline int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes)
990{
991	return 1;
992}
993
994static inline struct page *
995shmem_swapin(struct shmem_inode_info *info,swp_entry_t entry,unsigned long idx)
996{
997	swapin_readahead(entry, 0, NULL);
998	return read_swap_cache_async(entry, NULL, 0);
999}
1000
1001static inline struct page *
1002shmem_alloc_page(gfp_t gfp,struct shmem_inode_info *info, unsigned long idx)
1003{
1004	return alloc_page(gfp | __GFP_ZERO);
1005}
1006#endif
1007
1008/*
1009 * shmem_getpage - either get the page from swap or allocate a new one
1010 *
1011 * If we allocate a new one we do not mark it dirty. That's up to the
1012 * vm. If we swap it in we mark it dirty, since we also free the swap
1013 * entry: a page cannot live in both the swap and page cache.
1014 */
1015static int shmem_getpage(struct inode *inode, unsigned long idx,
1016			struct page **pagep, enum sgp_type sgp, int *type)
1017{
1018	struct address_space *mapping = inode->i_mapping;
1019	struct shmem_inode_info *info = SHMEM_I(inode);
1020	struct shmem_sb_info *sbinfo;
1021	struct page *filepage = *pagep;
1022	struct page *swappage;
1023	swp_entry_t *entry;
1024	swp_entry_t swap;
1025	int error;
1026
1027	if (idx >= SHMEM_MAX_INDEX)
1028		return -EFBIG;
1029	/*
1030	 * Normally, filepage is NULL on entry, and either found
1031	 * uptodate immediately, or allocated and zeroed, or read
1032	 * in under swappage, which is then assigned to filepage.
1033	 * But shmem_prepare_write passes in a locked filepage,
1034	 * which may be found not uptodate by other callers too,
1035	 * and may need to be copied from the swappage read in.
1036	 */
1037repeat:
1038	if (!filepage)
1039		filepage = find_lock_page(mapping, idx);
1040	if (filepage && PageUptodate(filepage))
1041		goto done;
1042	error = 0;
1043	if (sgp == SGP_QUICK)
1044		goto failed;
1045
1046	spin_lock(&info->lock);
1047	shmem_recalc_inode(inode);
1048	entry = shmem_swp_alloc(info, idx, sgp);
1049	if (IS_ERR(entry)) {
1050		spin_unlock(&info->lock);
1051		error = PTR_ERR(entry);
1052		goto failed;
1053	}
1054	swap = *entry;
1055
1056	if (swap.val) {
1057		/* Look it up and read it in.. */
1058		swappage = lookup_swap_cache(swap);
1059		if (!swappage) {
1060			shmem_swp_unmap(entry);
1061			/* here we actually do the io */
1062			if (type && *type == VM_FAULT_MINOR) {
1063				__count_vm_event(PGMAJFAULT);
1064				*type = VM_FAULT_MAJOR;
1065			}
1066			spin_unlock(&info->lock);
1067			swappage = shmem_swapin(info, swap, idx);
1068			if (!swappage) {
1069				spin_lock(&info->lock);
1070				entry = shmem_swp_alloc(info, idx, sgp);
1071				if (IS_ERR(entry))
1072					error = PTR_ERR(entry);
1073				else {
1074					if (entry->val == swap.val)
1075						error = -ENOMEM;
1076					shmem_swp_unmap(entry);
1077				}
1078				spin_unlock(&info->lock);
1079				if (error)
1080					goto failed;
1081				goto repeat;
1082			}
1083			wait_on_page_locked(swappage);
1084			page_cache_release(swappage);
1085			goto repeat;
1086		}
1087
1088		/* We have to do this with page locked to prevent races */
1089		if (TestSetPageLocked(swappage)) {
1090			shmem_swp_unmap(entry);
1091			spin_unlock(&info->lock);
1092			wait_on_page_locked(swappage);
1093			page_cache_release(swappage);
1094			goto repeat;
1095		}
1096		if (PageWriteback(swappage)) {
1097			shmem_swp_unmap(entry);
1098			spin_unlock(&info->lock);
1099			wait_on_page_writeback(swappage);
1100			unlock_page(swappage);
1101			page_cache_release(swappage);
1102			goto repeat;
1103		}
1104		if (!PageUptodate(swappage)) {
1105			shmem_swp_unmap(entry);
1106			spin_unlock(&info->lock);
1107			unlock_page(swappage);
1108			page_cache_release(swappage);
1109			error = -EIO;
1110			goto failed;
1111		}
1112
1113		if (filepage) {
1114			shmem_swp_set(info, entry, 0);
1115			shmem_swp_unmap(entry);
1116			delete_from_swap_cache(swappage);
1117			spin_unlock(&info->lock);
1118			copy_highpage(filepage, swappage);
1119			unlock_page(swappage);
1120			page_cache_release(swappage);
1121			flush_dcache_page(filepage);
1122			SetPageUptodate(filepage);
1123			set_page_dirty(filepage);
1124			swap_free(swap);
1125		} else if (!(error = move_from_swap_cache(
1126				swappage, idx, mapping))) {
1127			info->flags |= SHMEM_PAGEIN;
1128			shmem_swp_set(info, entry, 0);
1129			shmem_swp_unmap(entry);
1130			spin_unlock(&info->lock);
1131			filepage = swappage;
1132			swap_free(swap);
1133		} else {
1134			shmem_swp_unmap(entry);
1135			spin_unlock(&info->lock);
1136			unlock_page(swappage);
1137			page_cache_release(swappage);
1138			if (error == -ENOMEM) {
1139				/* let kswapd refresh zone for GFP_ATOMICs */
1140				congestion_wait(WRITE, HZ/50);
1141			}
1142			goto repeat;
1143		}
1144	} else if (sgp == SGP_READ && !filepage) {
1145		shmem_swp_unmap(entry);
1146		filepage = find_get_page(mapping, idx);
1147		if (filepage &&
1148		    (!PageUptodate(filepage) || TestSetPageLocked(filepage))) {
1149			spin_unlock(&info->lock);
1150			wait_on_page_locked(filepage);
1151			page_cache_release(filepage);
1152			filepage = NULL;
1153			goto repeat;
1154		}
1155		spin_unlock(&info->lock);
1156	} else {
1157		shmem_swp_unmap(entry);
1158		sbinfo = SHMEM_SB(inode->i_sb);
1159		if (sbinfo->max_blocks) {
1160			spin_lock(&sbinfo->stat_lock);
1161			if (sbinfo->free_blocks == 0 ||
1162			    shmem_acct_block(info->flags)) {
1163				spin_unlock(&sbinfo->stat_lock);
1164				spin_unlock(&info->lock);
1165				error = -ENOSPC;
1166				goto failed;
1167			}
1168			sbinfo->free_blocks--;
1169			inode->i_blocks += BLOCKS_PER_PAGE;
1170			spin_unlock(&sbinfo->stat_lock);
1171		} else if (shmem_acct_block(info->flags)) {
1172			spin_unlock(&info->lock);
1173			error = -ENOSPC;
1174			goto failed;
1175		}
1176
1177		if (!filepage) {
1178			spin_unlock(&info->lock);
1179			filepage = shmem_alloc_page(mapping_gfp_mask(mapping),
1180						    info,
1181						    idx);
1182			if (!filepage) {
1183				shmem_unacct_blocks(info->flags, 1);
1184				shmem_free_blocks(inode, 1);
1185				error = -ENOMEM;
1186				goto failed;
1187			}
1188
1189			spin_lock(&info->lock);
1190			entry = shmem_swp_alloc(info, idx, sgp);
1191			if (IS_ERR(entry))
1192				error = PTR_ERR(entry);
1193			else {
1194				swap = *entry;
1195				shmem_swp_unmap(entry);
1196			}
1197			if (error || swap.val || 0 != add_to_page_cache_lru(
1198					filepage, mapping, idx, GFP_ATOMIC)) {
1199				spin_unlock(&info->lock);
1200				page_cache_release(filepage);
1201				shmem_unacct_blocks(info->flags, 1);
1202				shmem_free_blocks(inode, 1);
1203				filepage = NULL;
1204				if (error)
1205					goto failed;
1206				goto repeat;
1207			}
1208			info->flags |= SHMEM_PAGEIN;
1209		}
1210
1211		info->alloced++;
1212		spin_unlock(&info->lock);
1213		flush_dcache_page(filepage);
1214		SetPageUptodate(filepage);
1215	}
1216done:
1217	if (*pagep != filepage) {
1218		unlock_page(filepage);
1219		*pagep = filepage;
1220	}
1221	return 0;
1222
1223failed:
1224	if (*pagep != filepage) {
1225		unlock_page(filepage);
1226		page_cache_release(filepage);
1227	}
1228	return error;
1229}
1230
1231struct page *shmem_nopage(struct vm_area_struct *vma, unsigned long address, int *type)
1232{
1233	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
1234	struct page *page = NULL;
1235	unsigned long idx;
1236	int error;
1237
1238	idx = (address - vma->vm_start) >> PAGE_SHIFT;
1239	idx += vma->vm_pgoff;
1240	idx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT;
1241	if (((loff_t) idx << PAGE_CACHE_SHIFT) >= i_size_read(inode))
1242		return NOPAGE_SIGBUS;
1243
1244	error = shmem_getpage(inode, idx, &page, SGP_CACHE, type);
1245	if (error)
1246		return (error == -ENOMEM)? NOPAGE_OOM: NOPAGE_SIGBUS;
1247
1248	mark_page_accessed(page);
1249	return page;
1250}
1251
1252static int shmem_populate(struct vm_area_struct *vma,
1253	unsigned long addr, unsigned long len,
1254	pgprot_t prot, unsigned long pgoff, int nonblock)
1255{
1256	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
1257	struct mm_struct *mm = vma->vm_mm;
1258	enum sgp_type sgp = nonblock? SGP_QUICK: SGP_CACHE;
1259	unsigned long size;
1260
1261	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
1262	if (pgoff >= size || pgoff + (len >> PAGE_SHIFT) > size)
1263		return -EINVAL;
1264
1265	while ((long) len > 0) {
1266		struct page *page = NULL;
1267		int err;
1268		/*
1269		 * Will need changing if PAGE_CACHE_SIZE != PAGE_SIZE
1270		 */
1271		err = shmem_getpage(inode, pgoff, &page, sgp, NULL);
1272		if (err)
1273			return err;
1274		/* Page may still be null, but only if nonblock was set. */
1275		if (page) {
1276			mark_page_accessed(page);
1277			err = install_page(mm, vma, addr, page, prot);
1278			if (err) {
1279				page_cache_release(page);
1280				return err;
1281			}
1282		} else if (vma->vm_flags & VM_NONLINEAR) {
1283			/* No page was found just because we can't read it in
1284			 * now (being here implies nonblock != 0), but the page
1285			 * may exist, so set the PTE to fault it in later. */
1286			err = install_file_pte(mm, vma, addr, pgoff, prot);
1287			if (err)
1288				return err;
1289		}
1290
1291		len -= PAGE_SIZE;
1292		addr += PAGE_SIZE;
1293		pgoff++;
1294	}
1295	return 0;
1296}
1297
1298#ifdef CONFIG_NUMA
1299int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
1300{
1301	struct inode *i = vma->vm_file->f_path.dentry->d_inode;
1302	return mpol_set_shared_policy(&SHMEM_I(i)->policy, vma, new);
1303}
1304
1305struct mempolicy *
1306shmem_get_policy(struct vm_area_struct *vma, unsigned long addr)
1307{
1308	struct inode *i = vma->vm_file->f_path.dentry->d_inode;
1309	unsigned long idx;
1310
1311	idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
1312	return mpol_shared_policy_lookup(&SHMEM_I(i)->policy, idx);
1313}
1314#endif
1315
1316int shmem_lock(struct file *file, int lock, struct user_struct *user)
1317{
1318	struct inode *inode = file->f_path.dentry->d_inode;
1319	struct shmem_inode_info *info = SHMEM_I(inode);
1320	int retval = -ENOMEM;
1321
1322	spin_lock(&info->lock);
1323	if (lock && !(info->flags & VM_LOCKED)) {
1324		if (!user_shm_lock(inode->i_size, user))
1325			goto out_nomem;
1326		info->flags |= VM_LOCKED;
1327	}
1328	if (!lock && (info->flags & VM_LOCKED) && user) {
1329		user_shm_unlock(inode->i_size, user);
1330		info->flags &= ~VM_LOCKED;
1331	}
1332	retval = 0;
1333out_nomem:
1334	spin_unlock(&info->lock);
1335	return retval;
1336}
1337
1338int shmem_mmap(struct file *file, struct vm_area_struct *vma)
1339{
1340	file_accessed(file);
1341	vma->vm_ops = &shmem_vm_ops;
1342	return 0;
1343}
1344
1345static struct inode *
1346shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
1347{
1348	struct inode *inode;
1349	struct shmem_inode_info *info;
1350	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
1351
1352	if (sbinfo->max_inodes) {
1353		spin_lock(&sbinfo->stat_lock);
1354		if (!sbinfo->free_inodes) {
1355			spin_unlock(&sbinfo->stat_lock);
1356			return NULL;
1357		}
1358		sbinfo->free_inodes--;
1359		spin_unlock(&sbinfo->stat_lock);
1360	}
1361
1362	inode = new_inode(sb);
1363	if (inode) {
1364		inode->i_mode = mode;
1365		inode->i_uid = current->fsuid;
1366		inode->i_gid = current->fsgid;
1367		inode->i_blocks = 0;
1368		inode->i_mapping->a_ops = &shmem_aops;
1369		inode->i_mapping->backing_dev_info = &shmem_backing_dev_info;
1370		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
1371		inode->i_generation = get_seconds();
1372		info = SHMEM_I(inode);
1373		memset(info, 0, (char *)inode - (char *)info);
1374		spin_lock_init(&info->lock);
1375		INIT_LIST_HEAD(&info->swaplist);
1376
1377		switch (mode & S_IFMT) {
1378		default:
1379			inode->i_op = &shmem_special_inode_operations;
1380			init_special_inode(inode, mode, dev);
1381			break;
1382		case S_IFREG:
1383			inode->i_op = &shmem_inode_operations;
1384			inode->i_fop = &shmem_file_operations;
1385			mpol_shared_policy_init(&info->policy, sbinfo->policy,
1386							&sbinfo->policy_nodes);
1387			break;
1388		case S_IFDIR:
1389			inc_nlink(inode);
1390			/* Some things misbehave if size == 0 on a directory */
1391			inode->i_size = 2 * BOGO_DIRENT_SIZE;
1392			inode->i_op = &shmem_dir_inode_operations;
1393			inode->i_fop = &simple_dir_operations;
1394			break;
1395		case S_IFLNK:
1396			/*
1397			 * Must not load anything in the rbtree,
1398			 * mpol_free_shared_policy will not be called.
1399			 */
1400			mpol_shared_policy_init(&info->policy, MPOL_DEFAULT,
1401						NULL);
1402			break;
1403		}
1404	} else if (sbinfo->max_inodes) {
1405		spin_lock(&sbinfo->stat_lock);
1406		sbinfo->free_inodes++;
1407		spin_unlock(&sbinfo->stat_lock);
1408	}
1409	return inode;
1410}
1411
1412#ifdef CONFIG_TMPFS
1413static struct inode_operations shmem_symlink_inode_operations;
1414static struct inode_operations shmem_symlink_inline_operations;
1415
1416/*
1417 * Normally tmpfs makes no use of shmem_prepare_write, but it
1418 * lets a tmpfs file be used read-write below the loop driver.
1419 */
1420static int
1421shmem_prepare_write(struct file *file, struct page *page, unsigned offset, unsigned to)
1422{
1423	struct inode *inode = page->mapping->host;
1424	return shmem_getpage(inode, page->index, &page, SGP_WRITE, NULL);
1425}
1426
1427static ssize_t
1428shmem_file_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
1429{
1430	struct inode	*inode = file->f_path.dentry->d_inode;
1431	loff_t		pos;
1432	unsigned long	written;
1433	ssize_t		err;
1434
1435	if ((ssize_t) count < 0)
1436		return -EINVAL;
1437
1438	if (!access_ok(VERIFY_READ, buf, count))
1439		return -EFAULT;
1440
1441	mutex_lock(&inode->i_mutex);
1442
1443	pos = *ppos;
1444	written = 0;
1445
1446	err = generic_write_checks(file, &pos, &count, 0);
1447	if (err || !count)
1448		goto out;
1449
1450	err = remove_suid(file->f_path.dentry);
1451	if (err)
1452		goto out;
1453
1454	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
1455
1456	do {
1457		struct page *page = NULL;
1458		unsigned long bytes, index, offset;
1459		char *kaddr;
1460		int left;
1461
1462		offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
1463		index = pos >> PAGE_CACHE_SHIFT;
1464		bytes = PAGE_CACHE_SIZE - offset;
1465		if (bytes > count)
1466			bytes = count;
1467
1468		/*
1469		 * We don't hold page lock across copy from user -
1470		 * what would it guard against? - so no deadlock here.
1471		 * But it still may be a good idea to prefault below.
1472		 */
1473
1474		err = shmem_getpage(inode, index, &page, SGP_WRITE, NULL);
1475		if (err)
1476			break;
1477
1478		left = bytes;
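		/*
		 * For a highmem page, touch the first and last byte of the
		 * user buffer first: __copy_from_user_inatomic runs under
		 * kmap_atomic and cannot sleep to fault pages in, so we
		 * prefault here and fall back to the sleeping kmap/copy
		 * below if it still could not copy everything.
		 */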
1479		if (PageHighMem(page)) {
1480			volatile unsigned char dummy;
1481			__get_user(dummy, buf);
1482			__get_user(dummy, buf + bytes - 1);
1483
1484			kaddr = kmap_atomic(page, KM_USER0);
1485			left = __copy_from_user_inatomic(kaddr + offset,
1486							buf, bytes);
1487			kunmap_atomic(kaddr, KM_USER0);
1488		}
1489		if (left) {
1490			kaddr = kmap(page);
1491			left = __copy_from_user(kaddr + offset, buf, bytes);
1492			kunmap(page);
1493		}
1494
1495		written += bytes;
1496		count -= bytes;
1497		pos += bytes;
1498		buf += bytes;
1499		if (pos > inode->i_size)
1500			i_size_write(inode, pos);
1501
1502		flush_dcache_page(page);
1503		set_page_dirty(page);
1504		mark_page_accessed(page);
1505		page_cache_release(page);
1506
1507		if (left) {
1508			pos -= left;
1509			written -= left;
1510			err = -EFAULT;
1511			break;
1512		}
1513
1514		/*
1515		 * Our dirty pages are not counted in nr_dirty,
1516		 * and we do not attempt to balance dirty pages.
1517		 */
1518
1519		cond_resched();
1520	} while (count);
1521
1522	*ppos = pos;
1523	if (written)
1524		err = written;
1525out:
1526	mutex_unlock(&inode->i_mutex);
1527	return err;
1528}
1529
1530static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc, read_actor_t actor)
1531{
1532	struct inode *inode = filp->f_path.dentry->d_inode;
1533	struct address_space *mapping = inode->i_mapping;
1534	unsigned long index, offset;
1535
1536	index = *ppos >> PAGE_CACHE_SHIFT;
1537	offset = *ppos & ~PAGE_CACHE_MASK;
1538
1539	for (;;) {
1540		struct page *page = NULL;
1541		unsigned long end_index, nr, ret;
1542		loff_t i_size = i_size_read(inode);
1543
1544		end_index = i_size >> PAGE_CACHE_SHIFT;
1545		if (index > end_index)
1546			break;
1547		if (index == end_index) {
1548			nr = i_size & ~PAGE_CACHE_MASK;
1549			if (nr <= offset)
1550				break;
1551		}
1552
1553		desc->error = shmem_getpage(inode, index, &page, SGP_READ, NULL);
1554		if (desc->error) {
1555			if (desc->error == -EINVAL)
1556				desc->error = 0;
1557			break;
1558		}
1559
1560		/*
1561		 * We must re-check i_size after getting the page, since reads
1562		 * (unlike writes) are called without i_mutex protection against truncate
1563		 */
1564		nr = PAGE_CACHE_SIZE;
1565		i_size = i_size_read(inode);
1566		end_index = i_size >> PAGE_CACHE_SHIFT;
1567		if (index == end_index) {
1568			nr = i_size & ~PAGE_CACHE_MASK;
1569			if (nr <= offset) {
1570				if (page)
1571					page_cache_release(page);
1572				break;
1573			}
1574		}
1575		nr -= offset;
1576
1577		if (page) {
1578			/*
1579			 * If users can be writing to this page using arbitrary
1580			 * virtual addresses, take care about potential aliasing
1581			 * before reading the page on the kernel side.
1582			 */
1583			if (mapping_writably_mapped(mapping))
1584				flush_dcache_page(page);
1585			/*
1586			 * Mark the page accessed if we read the beginning.
1587			 */
1588			if (!offset)
1589				mark_page_accessed(page);
1590		} else {
1591			page = ZERO_PAGE(0);
1592			page_cache_get(page);
1593		}
1594
1595		/*
1596		 * Ok, we have the page, and it's up-to-date, so
1597		 * now we can copy it to user space...
1598		 *
1599		 * The actor routine returns how many bytes were actually used..
1600		 * NOTE! This may not be the same as how much of a user buffer
1601		 * we filled up (we may be padding etc), so we can only update
1602		 * "pos" here (the actor routine has to update the user buffer
1603		 * pointers and the remaining count).
1604		 */
1605		ret = actor(desc, page, offset, nr);
1606		offset += ret;
1607		index += offset >> PAGE_CACHE_SHIFT;
1608		offset &= ~PAGE_CACHE_MASK;
1609
1610		page_cache_release(page);
1611		if (ret != nr || !desc->count)
1612			break;
1613
1614		cond_resched();
1615	}
1616
1617	*ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
1618	file_accessed(filp);
1619}
1620
1621static ssize_t shmem_file_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos)
1622{
1623	read_descriptor_t desc;
1624
1625	if ((ssize_t) count < 0)
1626		return -EINVAL;
1627	if (!access_ok(VERIFY_WRITE, buf, count))
1628		return -EFAULT;
1629	if (!count)
1630		return 0;
1631
1632	desc.written = 0;
1633	desc.count = count;
1634	desc.arg.buf = buf;
1635	desc.error = 0;
1636
1637	do_shmem_file_read(filp, ppos, &desc, file_read_actor);
1638	if (desc.written)
1639		return desc.written;
1640	return desc.error;
1641}
1642
1643static ssize_t shmem_file_sendfile(struct file *in_file, loff_t *ppos,
1644			 size_t count, read_actor_t actor, void *target)
1645{
1646	read_descriptor_t desc;
1647
1648	if (!count)
1649		return 0;
1650
1651	desc.written = 0;
1652	desc.count = count;
1653	desc.arg.data = target;
1654	desc.error = 0;
1655
1656	do_shmem_file_read(in_file, ppos, &desc, actor);
1657	if (desc.written)
1658		return desc.written;
1659	return desc.error;
1660}
1661
1662static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
1663{
1664	struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);
1665
1666	buf->f_type = TMPFS_MAGIC;
1667	buf->f_bsize = PAGE_CACHE_SIZE;
1668	buf->f_namelen = NAME_MAX;
1669	spin_lock(&sbinfo->stat_lock);
1670	if (sbinfo->max_blocks) {
1671		buf->f_blocks = sbinfo->max_blocks;
1672		buf->f_bavail = buf->f_bfree = sbinfo->free_blocks;
1673	}
1674	if (sbinfo->max_inodes) {
1675		buf->f_files = sbinfo->max_inodes;
1676		buf->f_ffree = sbinfo->free_inodes;
1677	}
1678	/* else leave those fields 0 like simple_statfs */
1679	spin_unlock(&sbinfo->stat_lock);
1680	return 0;
1681}
1682
1683/*
1684 * File creation. Allocate an inode, and we're done..
1685 */
1686static int
1687shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
1688{
1689	struct inode *inode = shmem_get_inode(dir->i_sb, mode, dev);
1690	int error = -ENOSPC;
1691
1692	if (inode) {
1693		error = security_inode_init_security(inode, dir, NULL, NULL,
1694						     NULL);
1695		if (error) {
1696			if (error != -EOPNOTSUPP) {
1697				iput(inode);
1698				return error;
1699			}
1700		}
1701		error = shmem_acl_init(inode, dir);
1702		if (error) {
1703			iput(inode);
1704			return error;
1705		}
1706		if (dir->i_mode & S_ISGID) {
1707			inode->i_gid = dir->i_gid;
1708			if (S_ISDIR(mode))
1709				inode->i_mode |= S_ISGID;
1710		}
1711		dir->i_size += BOGO_DIRENT_SIZE;
1712		dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1713		d_instantiate(dentry, inode);
1714		dget(dentry); /* Extra count - pin the dentry in core */
1715	}
1716	return error;
1717}
1718
1719static int shmem_mkdir(struct inode *dir, struct dentry *dentry, int mode)
1720{
1721	int error;
1722
1723	if ((error = shmem_mknod(dir, dentry, mode | S_IFDIR, 0)))
1724		return error;
1725	inc_nlink(dir);
1726	return 0;
1727}
1728
1729static int shmem_create(struct inode *dir, struct dentry *dentry, int mode,
1730		struct nameidata *nd)
1731{
1732	return shmem_mknod(dir, dentry, mode | S_IFREG, 0);
1733}
1734
1735/*
1736 * Link a file..
1737 */
1738static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
1739{
1740	struct inode *inode = old_dentry->d_inode;
1741	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
1742
1743	/*
1744	 * No ordinary (disk based) filesystem counts links as inodes;
1745	 * but each new link needs a new dentry, pinning lowmem, and
1746	 * tmpfs dentries cannot be pruned until they are unlinked.
1747	 */
1748	if (sbinfo->max_inodes) {
1749		spin_lock(&sbinfo->stat_lock);
1750		if (!sbinfo->free_inodes) {
1751			spin_unlock(&sbinfo->stat_lock);
1752			return -ENOSPC;
1753		}
1754		sbinfo->free_inodes--;
1755		spin_unlock(&sbinfo->stat_lock);
1756	}
1757
1758	dir->i_size += BOGO_DIRENT_SIZE;
1759	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1760	inc_nlink(inode);
1761	atomic_inc(&inode->i_count);	/* New dentry reference */
1762	dget(dentry);		/* Extra pinning count for the created dentry */
1763	d_instantiate(dentry, inode);
1764	return 0;
1765}
1766
1767static int shmem_unlink(struct inode *dir, struct dentry *dentry)
1768{
1769	struct inode *inode = dentry->d_inode;
1770
1771	if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode)) {
1772		struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
1773		if (sbinfo->max_inodes) {
1774			spin_lock(&sbinfo->stat_lock);
1775			sbinfo->free_inodes++;
1776			spin_unlock(&sbinfo->stat_lock);
1777		}
1778	}
1779
1780	dir->i_size -= BOGO_DIRENT_SIZE;
1781	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1782	drop_nlink(inode);
1783	dput(dentry);	/* Undo the count from "create" - this does all the work */
1784	return 0;
1785}
1786
1787static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
1788{
1789	if (!simple_empty(dentry))
1790		return -ENOTEMPTY;
1791
1792	drop_nlink(dentry->d_inode);
1793	drop_nlink(dir);
1794	return shmem_unlink(dir, dentry);
1795}
1796
1797/*
1798 * The VFS layer already does all the dentry stuff for rename;
1799 * we just have to decrement the usage count for the target if
1800 * it exists, so that the VFS layer correctly frees it when it
1801 * gets overwritten.
1802 */
1803static int shmem_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry)
1804{
1805	struct inode *inode = old_dentry->d_inode;
1806	int they_are_dirs = S_ISDIR(inode->i_mode);
1807
1808	if (!simple_empty(new_dentry))
1809		return -ENOTEMPTY;
1810
1811	if (new_dentry->d_inode) {
1812		(void) shmem_unlink(new_dir, new_dentry);
1813		if (they_are_dirs)
1814			drop_nlink(old_dir);
1815	} else if (they_are_dirs) {
1816		drop_nlink(old_dir);
1817		inc_nlink(new_dir);
1818	}
1819
1820	old_dir->i_size -= BOGO_DIRENT_SIZE;
1821	new_dir->i_size += BOGO_DIRENT_SIZE;
1822	old_dir->i_ctime = old_dir->i_mtime =
1823	new_dir->i_ctime = new_dir->i_mtime =
1824	inode->i_ctime = CURRENT_TIME;
1825	return 0;
1826}
1827
1828static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
1829{
1830	int error;
1831	int len;
1832	struct inode *inode;
1833	struct page *page = NULL;
1834	char *kaddr;
1835	struct shmem_inode_info *info;
1836
1837	len = strlen(symname) + 1;
1838	if (len > PAGE_CACHE_SIZE)
1839		return -ENAMETOOLONG;
1840
1841	inode = shmem_get_inode(dir->i_sb, S_IFLNK|S_IRWXUGO, 0);
1842	if (!inode)
1843		return -ENOSPC;
1844
1845	error = security_inode_init_security(inode, dir, NULL, NULL,
1846					     NULL);
1847	if (error) {
1848		if (error != -EOPNOTSUPP) {
1849			iput(inode);
1850			return error;
1851		}
1852		error = 0;
1853	}
1854
1855	info = SHMEM_I(inode);
1856	inode->i_size = len-1;
1857	if (len <= (char *)inode - (char *)info) {
1858		/* do it inline */
1859		memcpy(info, symname, len);
1860		inode->i_op = &shmem_symlink_inline_operations;
1861	} else {
1862		error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL);
1863		if (error) {
1864			iput(inode);
1865			return error;
1866		}
1867		inode->i_op = &shmem_symlink_inode_operations;
1868		kaddr = kmap_atomic(page, KM_USER0);
1869		memcpy(kaddr, symname, len);
1870		kunmap_atomic(kaddr, KM_USER0);
1871		set_page_dirty(page);
1872		page_cache_release(page);
1873	}
1874	if (dir->i_mode & S_ISGID)
1875		inode->i_gid = dir->i_gid;
1876	dir->i_size += BOGO_DIRENT_SIZE;
1877	dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1878	d_instantiate(dentry, inode);
1879	dget(dentry);
1880	return 0;
1881}
1882
1883static void *shmem_follow_link_inline(struct dentry *dentry, struct nameidata *nd)
1884{
1885	nd_set_link(nd, (char *)SHMEM_I(dentry->d_inode));
1886	return NULL;
1887}
1888
1889static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd)
1890{
1891	struct page *page = NULL;
1892	int res = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL);
1893	nd_set_link(nd, res ? ERR_PTR(res) : kmap(page));
1894	return page;
1895}
1896
1897static void shmem_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
1898{
1899	if (!IS_ERR(nd_get_link(nd))) {
1900		struct page *page = cookie;
1901		kunmap(page);
1902		mark_page_accessed(page);
1903		page_cache_release(page);
1904	}
1905}
1906
1907static struct inode_operations shmem_symlink_inline_operations = {
1908	.readlink	= generic_readlink,
1909	.follow_link	= shmem_follow_link_inline,
1910};
1911
1912static struct inode_operations shmem_symlink_inode_operations = {
1913	.truncate	= shmem_truncate,
1914	.readlink	= generic_readlink,
1915	.follow_link	= shmem_follow_link,
1916	.put_link	= shmem_put_link,
1917};
1918
1919#ifdef CONFIG_TMPFS_POSIX_ACL
1920/**
1921 * Superblocks without xattr inode operations will get security.* xattr
1922 * support from the VFS "for free". As soon as we have any other xattrs
1923 * like ACLs, we also need to implement the security.* handlers at
1924 * filesystem level, though.
1925 */
1926
1927static size_t shmem_xattr_security_list(struct inode *inode, char *list,
1928					size_t list_len, const char *name,
1929					size_t name_len)
1930{
1931	return security_inode_listsecurity(inode, list, list_len);
1932}
1933
1934static int shmem_xattr_security_get(struct inode *inode, const char *name,
1935				    void *buffer, size_t size)
1936{
1937	if (strcmp(name, "") == 0)
1938		return -EINVAL;
1939	return security_inode_getsecurity(inode, name, buffer, size,
1940					  -EOPNOTSUPP);
1941}
1942
1943static int shmem_xattr_security_set(struct inode *inode, const char *name,
1944				    const void *value, size_t size, int flags)
1945{
1946	if (strcmp(name, "") == 0)
1947		return -EINVAL;
1948	return security_inode_setsecurity(inode, name, value, size, flags);
1949}
1950
1951static struct xattr_handler shmem_xattr_security_handler = {
1952	.prefix = XATTR_SECURITY_PREFIX,
1953	.list   = shmem_xattr_security_list,
1954	.get    = shmem_xattr_security_get,
1955	.set    = shmem_xattr_security_set,
1956};
1957
1958static struct xattr_handler *shmem_xattr_handlers[] = {
1959	&shmem_xattr_acl_access_handler,
1960	&shmem_xattr_acl_default_handler,
1961	&shmem_xattr_security_handler,
1962	NULL
1963};
1964#endif
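
/*
 * What the security.* handlers above end up servicing, seen from user
 * space (a hedged sketch; the path and label name are examples only):
 */
#if 0	/* illustrative only: user code built with <sys/xattr.h> */
	char label[256];
	ssize_t n = getxattr("/dev/shm/example", "security.selinux",
			     label, sizeof(label));
	if (n < 0)
		perror("getxattr");	/* EOPNOTSUPP if no LSM supplies it */
#endif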
1965
1966static struct dentry *shmem_get_parent(struct dentry *child)
1967{
1968	return ERR_PTR(-ESTALE);
1969}
1970
1971static int shmem_match(struct inode *ino, void *vfh)
1972{
1973	__u32 *fh = vfh;
1974	__u64 inum = fh[2];
1975	inum = (inum << 32) | fh[1];
1976	return ino->i_ino == inum && fh[0] == ino->i_generation;
1977}
1978
1979static struct dentry *shmem_get_dentry(struct super_block *sb, void *vfh)
1980{
1981	struct dentry *de = NULL;
1982	struct inode *inode;
1983	__u32 *fh = vfh;
1984	__u64 inum = fh[2];
1985	inum = (inum << 32) | fh[1];
1986
1987	inode = ilookup5(sb, (unsigned long)(inum+fh[0]), shmem_match, vfh);
1988	if (inode) {
1989		de = d_find_alias(inode);
1990		iput(inode);
1991	}
1992
1993	return de? de: ERR_PTR(-ESTALE);
1994}
1995
1996static struct dentry *shmem_decode_fh(struct super_block *sb, __u32 *fh,
1997		int len, int type,
1998		int (*acceptable)(void *context, struct dentry *de),
1999		void *context)
2000{
2001	if (len < 3)
2002		return ERR_PTR(-ESTALE);
2003
2004	return sb->s_export_op->find_exported_dentry(sb, fh, NULL, acceptable,
2005							context);
2006}
2007
2008static int shmem_encode_fh(struct dentry *dentry, __u32 *fh, int *len,
2009				int connectable)
2010{
2011	struct inode *inode = dentry->d_inode;
2012
2013	if (*len < 3)
2014		return 255;
2015
2016	if (hlist_unhashed(&inode->i_hash)) {
2017		/* Unfortunately insert_inode_hash is not idempotent,
2018		 * so as we hash inodes here rather than at creation
2019		 * time, we need a lock to ensure we only try
2020		 * to do it once
2021		 */
2022		static DEFINE_SPINLOCK(lock);
2023		spin_lock(&lock);
2024		if (hlist_unhashed(&inode->i_hash))
2025			__insert_inode_hash(inode,
2026					    inode->i_ino + inode->i_generation);
2027		spin_unlock(&lock);
2028	}
2029
2030	fh[0] = inode->i_generation;
2031	fh[1] = inode->i_ino;
2032	fh[2] = ((__u64)inode->i_ino) >> 32;
2033
2034	*len = 3;
2035	return 1;
2036}
2037
2038static struct export_operations shmem_export_ops = {
2039	.get_parent     = shmem_get_parent,
2040	.get_dentry     = shmem_get_dentry,
2041	.encode_fh      = shmem_encode_fh,
2042	.decode_fh      = shmem_decode_fh,
2043};
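
/*
 * A short sketch (not shmem.c code) of the 3-word file handle used above:
 * fh[0] is i_generation, fh[1]/fh[2] are the low/high halves of i_ino, and
 * the hash value passed to both __insert_inode_hash() and ilookup5() is
 * i_ino + i_generation, so encode and decode stay in step.
 */
#if 0	/* illustrative only; assumes some struct inode *inode in scope */
	__u32 fh[3] = { inode->i_generation,
			(__u32)inode->i_ino,
			(__u32)(((__u64)inode->i_ino) >> 32) };
	__u64 inum = ((__u64)fh[2] << 32) | fh[1];	/* what shmem_match() rebuilds */
	unsigned long hashval = (unsigned long)(inum + fh[0]);
#endif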
2044
2045static int shmem_parse_options(char *options, int *mode, uid_t *uid,
2046	gid_t *gid, unsigned long *blocks, unsigned long *inodes,
2047	int *policy, nodemask_t *policy_nodes)
2048{
2049	char *this_char, *value, *rest;
2050
2051	while (options != NULL) {
2052		this_char = options;
2053		for (;;) {
2054			/*
2055			 * NUL-terminate this option: unfortunately,
2056			 * mount options form a comma-separated list,
2057			 * but mpol's nodelist may also contain commas.
2058			 */
2059			options = strchr(options, ',');
2060			if (options == NULL)
2061				break;
2062			options++;
2063			if (!isdigit(*options)) {
2064				options[-1] = '\0';
2065				break;
2066			}
2067		}
2068		if (!*this_char)
2069			continue;
2070		if ((value = strchr(this_char,'=')) != NULL) {
2071			*value++ = 0;
2072		} else {
2073			printk(KERN_ERR
2074			    "tmpfs: No value for mount option '%s'\n",
2075			    this_char);
2076			return 1;
2077		}
2078
2079		if (!strcmp(this_char,"size")) {
2080			unsigned long long size;
2081			size = memparse(value,&rest);
2082			if (*rest == '%') {
2083				size <<= PAGE_SHIFT;
2084				size *= totalram_pages;
2085				do_div(size, 100);
2086				rest++;
2087			}
2088			if (*rest)
2089				goto bad_val;
2090			*blocks = size >> PAGE_CACHE_SHIFT;
2091		} else if (!strcmp(this_char,"nr_blocks")) {
2092			*blocks = memparse(value,&rest);
2093			if (*rest)
2094				goto bad_val;
2095		} else if (!strcmp(this_char,"nr_inodes")) {
2096			*inodes = memparse(value,&rest);
2097			if (*rest)
2098				goto bad_val;
2099		} else if (!strcmp(this_char,"mode")) {
2100			if (!mode)
2101				continue;
2102			*mode = simple_strtoul(value,&rest,8);
2103			if (*rest)
2104				goto bad_val;
2105		} else if (!strcmp(this_char,"uid")) {
2106			if (!uid)
2107				continue;
2108			*uid = simple_strtoul(value,&rest,0);
2109			if (*rest)
2110				goto bad_val;
2111		} else if (!strcmp(this_char,"gid")) {
2112			if (!gid)
2113				continue;
2114			*gid = simple_strtoul(value,&rest,0);
2115			if (*rest)
2116				goto bad_val;
2117		} else if (!strcmp(this_char,"mpol")) {
2118			if (shmem_parse_mpol(value,policy,policy_nodes))
2119				goto bad_val;
2120		} else {
2121			printk(KERN_ERR "tmpfs: Bad mount option %s\n",
2122			       this_char);
2123			return 1;
2124		}
2125	}
2126	return 0;
2127
2128bad_val:
2129	printk(KERN_ERR "tmpfs: Bad value '%s' for mount option '%s'\n",
2130	       value, this_char);
2131	return 1;
2132
2133}
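
/*
 * A hedged worked example of the "size=" handling above: memparse()
 * accepts k/m/g suffixes, and a trailing '%' means a percentage of total
 * RAM, so "size=50%" yields
 *	size = (50 << PAGE_SHIFT) * totalram_pages / 100
 * and *blocks ends up as roughly totalram_pages / 2.  The isdigit() peek
 * after each comma is what lets an mpol nodelist such as
 * "mpol=interleave:0,1" survive the comma-splitting of the option string.
 * Typical invocations (illustrative only):
 *	mount -t tmpfs -o size=512m,nr_inodes=10k,mode=1777 tmpfs /mnt
 *	mount -t tmpfs -o size=50%,mpol=interleave:0,1 tmpfs /mnt
 */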
2134
2135static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
2136{
2137	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
2138	unsigned long max_blocks = sbinfo->max_blocks;
2139	unsigned long max_inodes = sbinfo->max_inodes;
2140	int policy = sbinfo->policy;
2141	nodemask_t policy_nodes = sbinfo->policy_nodes;
2142	unsigned long blocks;
2143	unsigned long inodes;
2144	int error = -EINVAL;
2145
2146	if (shmem_parse_options(data, NULL, NULL, NULL, &max_blocks,
2147				&max_inodes, &policy, &policy_nodes))
2148		return error;
2149
2150	spin_lock(&sbinfo->stat_lock);
2151	blocks = sbinfo->max_blocks - sbinfo->free_blocks;
2152	inodes = sbinfo->max_inodes - sbinfo->free_inodes;
2153	if (max_blocks < blocks)
2154		goto out;
2155	if (max_inodes < inodes)
2156		goto out;
2157	/*
2158	 * Those tests also disallow limited->unlimited while any are in
2159	 * use, so i_blocks will always be zero when max_blocks is zero;
2160	 * but we must separately disallow unlimited->limited, because
2161	 * in that case we have no record of how much is already in use.
2162	 */
2163	if (max_blocks && !sbinfo->max_blocks)
2164		goto out;
2165	if (max_inodes && !sbinfo->max_inodes)
2166		goto out;
2167
2168	error = 0;
2169	sbinfo->max_blocks  = max_blocks;
2170	sbinfo->free_blocks = max_blocks - blocks;
2171	sbinfo->max_inodes  = max_inodes;
2172	sbinfo->free_inodes = max_inodes - inodes;
2173	sbinfo->policy = policy;
2174	sbinfo->policy_nodes = policy_nodes;
2175out:
2176	spin_unlock(&sbinfo->stat_lock);
2177	return error;
2178}
2179#endif
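
/*
 * A worked example of the remount checks above (numbers illustrative):
 * with max_blocks == 1000 and free_blocks == 400, 600 blocks are in use,
 * so a "mount -o remount,size=..." that asks for fewer than 600 pages is
 * rejected with -EINVAL; and since an unlimited instance (max == 0) keeps
 * no usage counts at all, switching it to a finite limit is refused too.
 */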
2180
2181static void shmem_put_super(struct super_block *sb)
2182{
2183	kfree(sb->s_fs_info);
2184	sb->s_fs_info = NULL;
2185}
2186
2187static int shmem_fill_super(struct super_block *sb,
2188			    void *data, int silent)
2189{
2190	struct inode *inode;
2191	struct dentry *root;
2192	int mode   = S_IRWXUGO | S_ISVTX;
2193	uid_t uid = current->fsuid;
2194	gid_t gid = current->fsgid;
2195	int err = -ENOMEM;
2196	struct shmem_sb_info *sbinfo;
2197	unsigned long blocks = 0;
2198	unsigned long inodes = 0;
2199	int policy = MPOL_DEFAULT;
2200	nodemask_t policy_nodes = node_online_map;
2201
2202#ifdef CONFIG_TMPFS
2203	/*
2204	 * By default we only allow half of the physical ram per
2205	 * tmpfs instance, limiting inodes to one per page of lowmem;
2206	 * but the internal instance is left unlimited.
2207	 */
2208	if (!(sb->s_flags & MS_NOUSER)) {
2209		blocks = totalram_pages / 2;
2210		inodes = totalram_pages - totalhigh_pages;
2211		if (inodes > blocks)
2212			inodes = blocks;
2213		if (shmem_parse_options(data, &mode, &uid, &gid, &blocks,
2214					&inodes, &policy, &policy_nodes))
2215			return -EINVAL;
2216	}
2217	sb->s_export_op = &shmem_export_ops;
2218#else
2219	sb->s_flags |= MS_NOUSER;
2220#endif
2221
2222	/* Round up to L1_CACHE_BYTES to resist false sharing */
2223	sbinfo = kmalloc(max((int)sizeof(struct shmem_sb_info),
2224				L1_CACHE_BYTES), GFP_KERNEL);
2225	if (!sbinfo)
2226		return -ENOMEM;
2227
2228	spin_lock_init(&sbinfo->stat_lock);
2229	sbinfo->max_blocks = blocks;
2230	sbinfo->free_blocks = blocks;
2231	sbinfo->max_inodes = inodes;
2232	sbinfo->free_inodes = inodes;
2233	sbinfo->policy = policy;
2234	sbinfo->policy_nodes = policy_nodes;
2235
2236	sb->s_fs_info = sbinfo;
2237	sb->s_maxbytes = SHMEM_MAX_BYTES;
2238	sb->s_blocksize = PAGE_CACHE_SIZE;
2239	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
2240	sb->s_magic = TMPFS_MAGIC;
2241	sb->s_op = &shmem_ops;
2242	sb->s_time_gran = 1;
2243#ifdef CONFIG_TMPFS_POSIX_ACL
2244	sb->s_xattr = shmem_xattr_handlers;
2245	sb->s_flags |= MS_POSIXACL;
2246#endif
2247
2248	inode = shmem_get_inode(sb, S_IFDIR | mode, 0);
2249	if (!inode)
2250		goto failed;
2251	inode->i_uid = uid;
2252	inode->i_gid = gid;
2253	root = d_alloc_root(inode);
2254	if (!root)
2255		goto failed_iput;
2256	sb->s_root = root;
2257	return 0;
2258
2259failed_iput:
2260	iput(inode);
2261failed:
2262	shmem_put_super(sb);
2263	return err;
2264}
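
/*
 * A hedged worked example of the defaults above: on a 1GB box with 4kB
 * pages, totalram_pages is 262144, so an option-less user mount gets
 * max_blocks = 131072 (half of RAM) and max_inodes capped at the same
 * 131072 when there is no highmem.  The kmalloc() size is rounded up to
 * L1_CACHE_BYTES so that the hot stat_lock does not false-share a cache
 * line with a neighbouring allocation.
 */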
2265
2266static struct kmem_cache *shmem_inode_cachep;
2267
2268static struct inode *shmem_alloc_inode(struct super_block *sb)
2269{
2270	struct shmem_inode_info *p;
2271	p = (struct shmem_inode_info *)kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL);
2272	if (!p)
2273		return NULL;
2274	return &p->vfs_inode;
2275}
2276
2277static void shmem_destroy_inode(struct inode *inode)
2278{
2279	if ((inode->i_mode & S_IFMT) == S_IFREG) {
2280		/* inline symlinks overwrite the info, so only S_IFREG has a policy to free */
2281		mpol_free_shared_policy(&SHMEM_I(inode)->policy);
2282	}
2283	shmem_acl_destroy_inode(inode);
2284	kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
2285}
2286
2287static void init_once(void *foo, struct kmem_cache *cachep,
2288		      unsigned long flags)
2289{
2290	struct shmem_inode_info *p = (struct shmem_inode_info *) foo;
2291
2292	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
2293	    SLAB_CTOR_CONSTRUCTOR) {
2294		inode_init_once(&p->vfs_inode);
2295#ifdef CONFIG_TMPFS_POSIX_ACL
2296		p->i_acl = NULL;
2297		p->i_default_acl = NULL;
2298#endif
2299	}
2300}
2301
2302static int init_inodecache(void)
2303{
2304	shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
2305				sizeof(struct shmem_inode_info),
2306				0, 0, init_once, NULL);
2307	if (shmem_inode_cachep == NULL)
2308		return -ENOMEM;
2309	return 0;
2310}
2311
2312static void destroy_inodecache(void)
2313{
2314	kmem_cache_destroy(shmem_inode_cachep);
2315}
2316
2317static const struct address_space_operations shmem_aops = {
2318	.writepage	= shmem_writepage,
2319	.set_page_dirty	= __set_page_dirty_no_writeback,
2320#ifdef CONFIG_TMPFS
2321	.prepare_write	= shmem_prepare_write,
2322	.commit_write	= simple_commit_write,
2323#endif
2324	.migratepage	= migrate_page,
2325};
2326
2327static const struct file_operations shmem_file_operations = {
2328	.mmap		= shmem_mmap,
2329#ifdef CONFIG_TMPFS
2330	.llseek		= generic_file_llseek,
2331	.read		= shmem_file_read,
2332	.write		= shmem_file_write,
2333	.fsync		= simple_sync_file,
2334	.sendfile	= shmem_file_sendfile,
2335#endif
2336};
2337
2338static struct inode_operations shmem_inode_operations = {
2339	.truncate	= shmem_truncate,
2340	.setattr	= shmem_notify_change,
2341	.truncate_range	= shmem_truncate_range,
2342#ifdef CONFIG_TMPFS_POSIX_ACL
2343	.setxattr	= generic_setxattr,
2344	.getxattr	= generic_getxattr,
2345	.listxattr	= generic_listxattr,
2346	.removexattr	= generic_removexattr,
2347	.permission	= shmem_permission,
2348#endif
2349
2350};
2351
2352static struct inode_operations shmem_dir_inode_operations = {
2353#ifdef CONFIG_TMPFS
2354	.create		= shmem_create,
2355	.lookup		= simple_lookup,
2356	.link		= shmem_link,
2357	.unlink		= shmem_unlink,
2358	.symlink	= shmem_symlink,
2359	.mkdir		= shmem_mkdir,
2360	.rmdir		= shmem_rmdir,
2361	.mknod		= shmem_mknod,
2362	.rename		= shmem_rename,
2363#endif
2364#ifdef CONFIG_TMPFS_POSIX_ACL
2365	.setattr	= shmem_notify_change,
2366	.setxattr	= generic_setxattr,
2367	.getxattr	= generic_getxattr,
2368	.listxattr	= generic_listxattr,
2369	.removexattr	= generic_removexattr,
2370	.permission	= shmem_permission,
2371#endif
2372};
2373
2374static struct inode_operations shmem_special_inode_operations = {
2375#ifdef CONFIG_TMPFS_POSIX_ACL
2376	.setattr	= shmem_notify_change,
2377	.setxattr	= generic_setxattr,
2378	.getxattr	= generic_getxattr,
2379	.listxattr	= generic_listxattr,
2380	.removexattr	= generic_removexattr,
2381	.permission	= shmem_permission,
2382#endif
2383};
2384
2385static struct super_operations shmem_ops = {
2386	.alloc_inode	= shmem_alloc_inode,
2387	.destroy_inode	= shmem_destroy_inode,
2388#ifdef CONFIG_TMPFS
2389	.statfs		= shmem_statfs,
2390	.remount_fs	= shmem_remount_fs,
2391#endif
2392	.delete_inode	= shmem_delete_inode,
2393	.drop_inode	= generic_delete_inode,
2394	.put_super	= shmem_put_super,
2395};
2396
2397static struct vm_operations_struct shmem_vm_ops = {
2398	.nopage		= shmem_nopage,
2399	.populate	= shmem_populate,
2400#ifdef CONFIG_NUMA
2401	.set_policy     = shmem_set_policy,
2402	.get_policy     = shmem_get_policy,
2403#endif
2404};
2405
2406
2407static int shmem_get_sb(struct file_system_type *fs_type,
2408	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
2409{
2410	return get_sb_nodev(fs_type, flags, data, shmem_fill_super, mnt);
2411}
2412
2413static struct file_system_type tmpfs_fs_type = {
2414	.owner		= THIS_MODULE,
2415	.name		= "tmpfs",
2416	.get_sb		= shmem_get_sb,
2417	.kill_sb	= kill_litter_super,
2418};
2419static struct vfsmount *shm_mnt;
2420
2421static int __init init_tmpfs(void)
2422{
2423	int error;
2424
2425	error = init_inodecache();
2426	if (error)
2427		goto out3;
2428
2429	error = register_filesystem(&tmpfs_fs_type);
2430	if (error) {
2431		printk(KERN_ERR "Could not register tmpfs\n");
2432		goto out2;
2433	}
2434
2435	shm_mnt = vfs_kern_mount(&tmpfs_fs_type, MS_NOUSER,
2436				tmpfs_fs_type.name, NULL);
2437	if (IS_ERR(shm_mnt)) {
2438		error = PTR_ERR(shm_mnt);
2439		printk(KERN_ERR "Could not kern_mount tmpfs\n");
2440		goto out1;
2441	}
2442	return 0;
2443
2444out1:
2445	unregister_filesystem(&tmpfs_fs_type);
2446out2:
2447	destroy_inodecache();
2448out3:
2449	shm_mnt = ERR_PTR(error);
2450	return error;
2451}
2452module_init(init_tmpfs)
2453
2454/*
2455 * shmem_file_setup - get an unlinked file living in tmpfs
2456 *
2457 * @name: name for dentry (to be seen in /proc/<pid>/maps)
2458 * @size: size to be set for the file
2459 * @flags: vm_flags for the mapping; only VM_ACCOUNT is honoured here
2460 */
2461struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
2462{
2463	int error;
2464	struct file *file;
2465	struct inode *inode;
2466	struct dentry *dentry, *root;
2467	struct qstr this;
2468
2469	if (IS_ERR(shm_mnt))
2470		return (void *)shm_mnt;
2471
2472	if (size < 0 || size > SHMEM_MAX_BYTES)
2473		return ERR_PTR(-EINVAL);
2474
2475	if (shmem_acct_size(flags, size))
2476		return ERR_PTR(-ENOMEM);
2477
2478	error = -ENOMEM;
2479	this.name = name;
2480	this.len = strlen(name);
2481	this.hash = 0; /* will go */
2482	root = shm_mnt->mnt_root;
2483	dentry = d_alloc(root, &this);
2484	if (!dentry)
2485		goto put_memory;
2486
2487	error = -ENFILE;
2488	file = get_empty_filp();
2489	if (!file)
2490		goto put_dentry;
2491
2492	error = -ENOSPC;
2493	inode = shmem_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0);
2494	if (!inode)
2495		goto close_file;
2496
2497	SHMEM_I(inode)->flags = flags & VM_ACCOUNT;
2498	d_instantiate(dentry, inode);
2499	inode->i_size = size;
2500	inode->i_nlink = 0;	/* It is unlinked */
2501	file->f_path.mnt = mntget(shm_mnt);
2502	file->f_path.dentry = dentry;
2503	file->f_mapping = inode->i_mapping;
2504	file->f_op = &shmem_file_operations;
2505	file->f_mode = FMODE_WRITE | FMODE_READ;
2506	return file;
2507
2508close_file:
2509	put_filp(file);
2510put_dentry:
2511	dput(dentry);
2512put_memory:
2513	shmem_unacct_size(flags, size);
2514	return ERR_PTR(error);
2515}
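
/*
 * A minimal usage sketch (illustrative only, not lifted from a real
 * caller): this is how SysV shm and shmem_zero_setup() below obtain their
 * tmpfs-backed object.
 */
#if 0	/* illustrative only */
	struct file *file = shmem_file_setup("SYSVexample", size, VM_ACCOUNT);
	if (IS_ERR(file))
		return PTR_ERR(file);
	/* ... stash the file; fput() it when the object is torn down ... */
#endif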
2516
2517/*
2518 * shmem_zero_setup - setup a shared anonymous mapping
2519 *
2520 * @vma: the vma to be mmapped, already prepared by do_mmap_pgoff
2521 */
2522int shmem_zero_setup(struct vm_area_struct *vma)
2523{
2524	struct file *file;
2525	loff_t size = vma->vm_end - vma->vm_start;
2526
2527	file = shmem_file_setup("dev/zero", size, vma->vm_flags);
2528	if (IS_ERR(file))
2529		return PTR_ERR(file);
2530
2531	if (vma->vm_file)
2532		fput(vma->vm_file);
2533	vma->vm_file = file;
2534	vma->vm_ops = &shmem_vm_ops;
2535	return 0;
2536}
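
/*
 * A user-space sketch (illustrative only) of the path that ends up here:
 * a shared anonymous mapping has no file of its own, so do_mmap_pgoff()
 * calls shmem_zero_setup() to give the vma a tmpfs-backed "dev/zero" file.
 */
#if 0	/* illustrative only: user code, not kernel code */
	void *p = mmap(NULL, 1 << 20, PROT_READ | PROT_WRITE,
		       MAP_SHARED | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED)
		perror("mmap");
#endif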
2537