shmem.c revision 759b9775c25f5e69aaea8a75c3914019e2dc5539
1/*
2 * Resizable virtual memory filesystem for Linux.
3 *
4 * Copyright (C) 2000 Linus Torvalds.
5 *		 2000 Transmeta Corp.
6 *		 2000-2001 Christoph Rohland
7 *		 2000-2001 SAP AG
8 *		 2002 Red Hat Inc.
9 * Copyright (C) 2002-2005 Hugh Dickins.
10 * Copyright (C) 2002-2005 VERITAS Software Corporation.
11 * Copyright (C) 2004 Andi Kleen, SuSE Labs
12 *
13 * Extended attribute support for tmpfs:
14 * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net>
15 * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
16 *
17 * This file is released under the GPL.
18 */
19
20/*
21 * This virtual memory filesystem is heavily based on the ramfs. It
22 * extends ramfs by the ability to use swap and honor resource limits
23 * which makes it a completely usable filesystem.
24 */
25
26#include <linux/module.h>
27#include <linux/init.h>
28#include <linux/fs.h>
29#include <linux/xattr.h>
30#include <linux/generic_acl.h>
31#include <linux/mm.h>
32#include <linux/mman.h>
33#include <linux/file.h>
34#include <linux/swap.h>
35#include <linux/pagemap.h>
36#include <linux/string.h>
37#include <linux/slab.h>
38#include <linux/backing-dev.h>
39#include <linux/shmem_fs.h>
40#include <linux/mount.h>
41#include <linux/writeback.h>
42#include <linux/vfs.h>
43#include <linux/blkdev.h>
44#include <linux/security.h>
45#include <linux/swapops.h>
46#include <linux/mempolicy.h>
47#include <linux/namei.h>
48#include <linux/ctype.h>
49#include <linux/migrate.h>
50#include <linux/highmem.h>
51#include <linux/backing-dev.h>
52
53#include <asm/uaccess.h>
54#include <asm/div64.h>
55#include <asm/pgtable.h>
56
57/* This magic number is used in glibc for posix shared memory */
58#define TMPFS_MAGIC	0x01021994
59
60#define ENTRIES_PER_PAGE (PAGE_CACHE_SIZE/sizeof(unsigned long))
61#define ENTRIES_PER_PAGEPAGE (ENTRIES_PER_PAGE*ENTRIES_PER_PAGE)
62#define BLOCKS_PER_PAGE  (PAGE_CACHE_SIZE/512)
63
64#define SHMEM_MAX_INDEX  (SHMEM_NR_DIRECT + (ENTRIES_PER_PAGEPAGE/2) * (ENTRIES_PER_PAGE+1))
65#define SHMEM_MAX_BYTES  ((unsigned long long)SHMEM_MAX_INDEX << PAGE_CACHE_SHIFT)
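/*
 * Rough worked example (values depend on config): with 4K pages and a
 * 32-bit unsigned long, ENTRIES_PER_PAGE is 1024, ENTRIES_PER_PAGEPAGE
 * is 1024*1024, and with SHMEM_NR_DIRECT == 16 (as in the swap vector
 * layout described before shmem_swp_entry below):
 *
 *   SHMEM_MAX_INDEX = 16 + (1048576/2) * 1025 ~= 537 million pages
 *   SHMEM_MAX_BYTES ~= 537e6 << 12 ~= 2 TiB
 *
 * The (ENTRIES_PER_PAGEPAGE/2)*(ENTRIES_PER_PAGE+1) term reflects the
 * split of i_indirect: its first half points directly at pages of swap
 * entries, its second half at directories of such pages.
 */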
66
67#define VM_ACCT(size)    (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT)
68
69/* info->flags needs VM_flags to handle pagein/truncate races efficiently */
70#define SHMEM_PAGEIN	 VM_READ
71#define SHMEM_TRUNCATE	 VM_WRITE
72
73/* Definition to limit shmem_truncate's steps between cond_rescheds */
74#define LATENCY_LIMIT	 64
75
76/* Pretend that each entry is of this size in directory's i_size */
77#define BOGO_DIRENT_SIZE 20
78
79/* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */
80enum sgp_type {
81	SGP_QUICK,	/* don't try more than file page cache lookup */
82	SGP_READ,	/* don't exceed i_size, don't allocate page */
83	SGP_CACHE,	/* don't exceed i_size, may allocate page */
84	SGP_WRITE,	/* may exceed i_size, may allocate page */
85};
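/*
 * How the sgp types are used in this file (summary of the callers seen
 * below): SGP_QUICK by shmem_populate() when nonblock is set, SGP_READ
 * by do_shmem_file_read(), shmem_follow_link() and the partial-page
 * hold in shmem_notify_change(), SGP_CACHE by shmem_nopage() and the
 * blocking shmem_populate(), and SGP_WRITE by shmem_file_write(),
 * shmem_prepare_write() and shmem_symlink().
 */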
86
87static int shmem_getpage(struct inode *inode, unsigned long idx,
88			 struct page **pagep, enum sgp_type sgp, int *type);
89
90static inline struct page *shmem_dir_alloc(gfp_t gfp_mask)
91{
92	/*
93	 * The above definition of ENTRIES_PER_PAGE, and the use of
94	 * BLOCKS_PER_PAGE on indirect pages, assume PAGE_CACHE_SIZE:
95	 * might be reconsidered if it ever diverges from PAGE_SIZE.
96	 */
97	return alloc_pages(gfp_mask, PAGE_CACHE_SHIFT-PAGE_SHIFT);
98}
99
100static inline void shmem_dir_free(struct page *page)
101{
102	__free_pages(page, PAGE_CACHE_SHIFT-PAGE_SHIFT);
103}
104
105static struct page **shmem_dir_map(struct page *page)
106{
107	return (struct page **)kmap_atomic(page, KM_USER0);
108}
109
110static inline void shmem_dir_unmap(struct page **dir)
111{
112	kunmap_atomic(dir, KM_USER0);
113}
114
115static swp_entry_t *shmem_swp_map(struct page *page)
116{
117	return (swp_entry_t *)kmap_atomic(page, KM_USER1);
118}
119
120static inline void shmem_swp_balance_unmap(void)
121{
122	/*
123	 * When passing a pointer to an i_direct entry, to code which
124	 * also handles indirect entries and so will shmem_swp_unmap,
125	 * we must arrange for the preempt count to remain in balance.
126	 * What kmap_atomic of a lowmem page does depends on config
127	 * and architecture, so pretend to kmap_atomic some lowmem page.
128	 */
129	(void) kmap_atomic(ZERO_PAGE(0), KM_USER1);
130}
131
132static inline void shmem_swp_unmap(swp_entry_t *entry)
133{
134	kunmap_atomic(entry, KM_USER1);
135}
136
137static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
138{
139	return sb->s_fs_info;
140}
141
142/*
143 * shmem_file_setup pre-accounts the whole fixed size of a VM object,
144 * for shared memory and for shared anonymous (/dev/zero) mappings
145 * (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1),
146 * consistent with the pre-accounting of private mappings ...
147 */
148static inline int shmem_acct_size(unsigned long flags, loff_t size)
149{
150	return (flags & VM_ACCOUNT)?
151		security_vm_enough_memory(VM_ACCT(size)): 0;
152}
153
154static inline void shmem_unacct_size(unsigned long flags, loff_t size)
155{
156	if (flags & VM_ACCOUNT)
157		vm_unacct_memory(VM_ACCT(size));
158}
159
160/*
161 * ... whereas tmpfs objects are accounted incrementally as
162 * pages are allocated, in order to allow huge sparse files.
163 * shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM,
164 * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
165 */
166static inline int shmem_acct_block(unsigned long flags)
167{
168	return (flags & VM_ACCOUNT)?
169		0: security_vm_enough_memory(VM_ACCT(PAGE_CACHE_SIZE));
170}
171
172static inline void shmem_unacct_blocks(unsigned long flags, long pages)
173{
174	if (!(flags & VM_ACCOUNT))
175		vm_unacct_memory(pages * VM_ACCT(PAGE_CACHE_SIZE));
176}
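/*
 * Illustration (assuming 4K pages): a shared memory object set up with
 * VM_ACCOUNT (see the comment above shmem_acct_size) of size 1MB is
 * charged VM_ACCT(1MB) == 256 pages against the overcommit limit up
 * front, and shmem_acct_block() then charges nothing per page.  A plain
 * tmpfs file has VM_ACCOUNT clear: nothing is charged at creation, and
 * each page instantiated by shmem_getpage() is charged individually via
 * shmem_acct_block().
 */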
177
178static const struct super_operations shmem_ops;
179static const struct address_space_operations shmem_aops;
180static const struct file_operations shmem_file_operations;
181static const struct inode_operations shmem_inode_operations;
182static const struct inode_operations shmem_dir_inode_operations;
183static const struct inode_operations shmem_special_inode_operations;
184static struct vm_operations_struct shmem_vm_ops;
185
186static struct backing_dev_info shmem_backing_dev_info  __read_mostly = {
187	.ra_pages	= 0,	/* No readahead */
188	.capabilities	= BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
189	.unplug_io_fn	= default_unplug_io_fn,
190};
191
192static LIST_HEAD(shmem_swaplist);
193static DEFINE_SPINLOCK(shmem_swaplist_lock);
194
195static void shmem_free_blocks(struct inode *inode, long pages)
196{
197	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
198	if (sbinfo->max_blocks) {
199		spin_lock(&sbinfo->stat_lock);
200		sbinfo->free_blocks += pages;
201		inode->i_blocks -= pages*BLOCKS_PER_PAGE;
202		spin_unlock(&sbinfo->stat_lock);
203	}
204}
205
206/*
207 * shmem_recalc_inode - recalculate the size of an inode
208 *
209 * @inode: inode to recalc
210 *
211 * We have to calculate the free blocks since the mm can drop
212 * undirtied hole pages behind our back.
213 *
214 * But normally   info->alloced == inode->i_mapping->nrpages + info->swapped
215 * So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped)
216 *
217 * It has to be called with the spinlock held.
218 */
219static void shmem_recalc_inode(struct inode *inode)
220{
221	struct shmem_inode_info *info = SHMEM_I(inode);
222	long freed;
223
224	freed = info->alloced - info->swapped - inode->i_mapping->nrpages;
225	if (freed > 0) {
226		info->alloced -= freed;
227		shmem_unacct_blocks(info->flags, freed);
228		shmem_free_blocks(inode, freed);
229	}
230}
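/*
 * Example of the above: if info->alloced == 100, info->swapped == 10 and
 * nrpages == 85, then 5 clean hole pages were reclaimed by the mm behind
 * our back; they are un-accounted here and their blocks returned to the
 * superblock's free count.
 */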
231
232/*
233 * shmem_swp_entry - find the swap vector position in the info structure
234 *
235 * @info:  info structure for the inode
236 * @index: index of the page to find
237 * @page:  optional page to add to the structure. Has to be preset to
238 *         all zeros
239 *
240 * If there is no space allocated yet it will return NULL when
241 * page is NULL, else it will use the page for the needed block,
242 * setting it to NULL on return to indicate that it has been used.
243 *
244 * The swap vector is organized the following way:
245 *
246 * There are SHMEM_NR_DIRECT entries directly stored in the
247 * shmem_inode_info structure. So small files do not need an additional
248 * allocation.
249 *
250 * For pages with index > SHMEM_NR_DIRECT there is the pointer
251 * i_indirect which points to a page which holds in the first half
252 * doubly indirect blocks, in the second half triple indirect blocks:
253 *
254 * For an artificial ENTRIES_PER_PAGE = 4 this would lead to the
255 * following layout (for SHMEM_NR_DIRECT == 16):
256 *
257 * i_indirect -> dir --> 16-19
258 * 	      |	     +-> 20-23
259 * 	      |
260 * 	      +-->dir2 --> 24-27
261 * 	      |	       +-> 28-31
262 * 	      |	       +-> 32-35
263 * 	      |	       +-> 36-39
264 * 	      |
265 * 	      +-->dir3 --> 40-43
266 * 	       	       +-> 44-47
267 * 	      	       +-> 48-51
268 * 	      	       +-> 52-55
269 */
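/*
 * A concrete decomposition (assuming ENTRIES_PER_PAGE == 1024, i.e. 4K
 * pages and 32-bit longs, and SHMEM_NR_DIRECT == 16 as above): for
 * index 5000, index - 16 = 4984, so offset = 4984 % 1024 = 888 and the
 * directory slot is 4984 / 1024 = 4.  Since 4 < ENTRIES_PER_PAGE/2 this
 * is a doubly indirect entry: the result is entry 888 of the page that
 * i_indirect's slot 4 points to.
 */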
270static swp_entry_t *shmem_swp_entry(struct shmem_inode_info *info, unsigned long index, struct page **page)
271{
272	unsigned long offset;
273	struct page **dir;
274	struct page *subdir;
275
276	if (index < SHMEM_NR_DIRECT) {
277		shmem_swp_balance_unmap();
278		return info->i_direct+index;
279	}
280	if (!info->i_indirect) {
281		if (page) {
282			info->i_indirect = *page;
283			*page = NULL;
284		}
285		return NULL;			/* need another page */
286	}
287
288	index -= SHMEM_NR_DIRECT;
289	offset = index % ENTRIES_PER_PAGE;
290	index /= ENTRIES_PER_PAGE;
291	dir = shmem_dir_map(info->i_indirect);
292
293	if (index >= ENTRIES_PER_PAGE/2) {
294		index -= ENTRIES_PER_PAGE/2;
295		dir += ENTRIES_PER_PAGE/2 + index/ENTRIES_PER_PAGE;
296		index %= ENTRIES_PER_PAGE;
297		subdir = *dir;
298		if (!subdir) {
299			if (page) {
300				*dir = *page;
301				*page = NULL;
302			}
303			shmem_dir_unmap(dir);
304			return NULL;		/* need another page */
305		}
306		shmem_dir_unmap(dir);
307		dir = shmem_dir_map(subdir);
308	}
309
310	dir += index;
311	subdir = *dir;
312	if (!subdir) {
313		if (!page || !(subdir = *page)) {
314			shmem_dir_unmap(dir);
315			return NULL;		/* need a page */
316		}
317		*dir = subdir;
318		*page = NULL;
319	}
320	shmem_dir_unmap(dir);
321	return shmem_swp_map(subdir) + offset;
322}
323
324static void shmem_swp_set(struct shmem_inode_info *info, swp_entry_t *entry, unsigned long value)
325{
326	long incdec = value? 1: -1;
327
328	entry->val = value;
329	info->swapped += incdec;
330	if ((unsigned long)(entry - info->i_direct) >= SHMEM_NR_DIRECT) {
331		struct page *page = kmap_atomic_to_page(entry);
332		set_page_private(page, page_private(page) + incdec);
333	}
334}
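/*
 * Note: for entries that live in an indirect subpage (not i_direct),
 * page_private() of that subpage counts how many swap entries it holds;
 * shmem_truncate_range() and shmem_unuse_inode() use this count to skip
 * empty subpages and to free a subpage once its count drops to zero.
 */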
335
336/*
337 * shmem_swp_alloc - get the position of the swap entry for the page.
338 *                   If it does not exist allocate the entry.
339 *
340 * @info:	info structure for the inode
341 * @index:	index of the page to find
342 * @sgp:	check and recheck i_size? skip allocation?
343 */
344static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long index, enum sgp_type sgp)
345{
346	struct inode *inode = &info->vfs_inode;
347	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
348	struct page *page = NULL;
349	swp_entry_t *entry;
350
351	if (sgp != SGP_WRITE &&
352	    ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode))
353		return ERR_PTR(-EINVAL);
354
355	while (!(entry = shmem_swp_entry(info, index, &page))) {
356		if (sgp == SGP_READ)
357			return shmem_swp_map(ZERO_PAGE(0));
358		/*
359		 * Test free_blocks against 1 not 0, since we have 1 data
360		 * page (and perhaps indirect index pages) yet to allocate:
361		 * a waste to allocate index if we cannot allocate data.
362		 */
363		if (sbinfo->max_blocks) {
364			spin_lock(&sbinfo->stat_lock);
365			if (sbinfo->free_blocks <= 1) {
366				spin_unlock(&sbinfo->stat_lock);
367				return ERR_PTR(-ENOSPC);
368			}
369			sbinfo->free_blocks--;
370			inode->i_blocks += BLOCKS_PER_PAGE;
371			spin_unlock(&sbinfo->stat_lock);
372		}
373
374		spin_unlock(&info->lock);
375		page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping) | __GFP_ZERO);
376		if (page)
377			set_page_private(page, 0);
378		spin_lock(&info->lock);
379
380		if (!page) {
381			shmem_free_blocks(inode, 1);
382			return ERR_PTR(-ENOMEM);
383		}
384		if (sgp != SGP_WRITE &&
385		    ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
386			entry = ERR_PTR(-EINVAL);
387			break;
388		}
389		if (info->next_index <= index)
390			info->next_index = index + 1;
391	}
392	if (page) {
393		/* another task gave its page, or truncated the file */
394		shmem_free_blocks(inode, 1);
395		shmem_dir_free(page);
396	}
397	if (info->next_index <= index && !IS_ERR(entry))
398		info->next_index = index + 1;
399	return entry;
400}
401
402/*
403 * shmem_free_swp - free some swap entries in a directory
404 *
405 * @dir:   pointer to the directory
406 * @edir:  pointer after last entry of the directory
407 */
408static int shmem_free_swp(swp_entry_t *dir, swp_entry_t *edir)
409{
410	swp_entry_t *ptr;
411	int freed = 0;
412
413	for (ptr = dir; ptr < edir; ptr++) {
414		if (ptr->val) {
415			free_swap_and_cache(*ptr);
416			*ptr = (swp_entry_t){0};
417			freed++;
418		}
419	}
420	return freed;
421}
422
423static int shmem_map_and_free_swp(struct page *subdir,
424		int offset, int limit, struct page ***dir)
425{
426	swp_entry_t *ptr;
427	int freed = 0;
428
429	ptr = shmem_swp_map(subdir);
430	for (; offset < limit; offset += LATENCY_LIMIT) {
431		int size = limit - offset;
432		if (size > LATENCY_LIMIT)
433			size = LATENCY_LIMIT;
434		freed += shmem_free_swp(ptr+offset, ptr+offset+size);
435		if (need_resched()) {
436			shmem_swp_unmap(ptr);
437			if (*dir) {
438				shmem_dir_unmap(*dir);
439				*dir = NULL;
440			}
441			cond_resched();
442			ptr = shmem_swp_map(subdir);
443		}
444	}
445	shmem_swp_unmap(ptr);
446	return freed;
447}
448
449static void shmem_free_pages(struct list_head *next)
450{
451	struct page *page;
452	int freed = 0;
453
454	do {
455		page = container_of(next, struct page, lru);
456		next = next->next;
457		shmem_dir_free(page);
458		freed++;
459		if (freed >= LATENCY_LIMIT) {
460			cond_resched();
461			freed = 0;
462		}
463	} while (next);
464}
465
466static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
467{
468	struct shmem_inode_info *info = SHMEM_I(inode);
469	unsigned long idx;
470	unsigned long size;
471	unsigned long limit;
472	unsigned long stage;
473	unsigned long diroff;
474	struct page **dir;
475	struct page *topdir;
476	struct page *middir;
477	struct page *subdir;
478	swp_entry_t *ptr;
479	LIST_HEAD(pages_to_free);
480	long nr_pages_to_free = 0;
481	long nr_swaps_freed = 0;
482	int offset;
483	int freed;
484	int punch_hole = 0;
485
486	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
487	idx = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
488	if (idx >= info->next_index)
489		return;
490
491	spin_lock(&info->lock);
492	info->flags |= SHMEM_TRUNCATE;
493	if (likely(end == (loff_t) -1)) {
494		limit = info->next_index;
495		info->next_index = idx;
496	} else {
497		limit = (end + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
498		if (limit > info->next_index)
499			limit = info->next_index;
500		punch_hole = 1;
501	}
502
503	topdir = info->i_indirect;
504	if (topdir && idx <= SHMEM_NR_DIRECT && !punch_hole) {
505		info->i_indirect = NULL;
506		nr_pages_to_free++;
507		list_add(&topdir->lru, &pages_to_free);
508	}
509	spin_unlock(&info->lock);
510
511	if (info->swapped && idx < SHMEM_NR_DIRECT) {
512		ptr = info->i_direct;
513		size = limit;
514		if (size > SHMEM_NR_DIRECT)
515			size = SHMEM_NR_DIRECT;
516		nr_swaps_freed = shmem_free_swp(ptr+idx, ptr+size);
517	}
518
519	/*
520	 * If there are no indirect blocks or we are punching a hole
521	 * below indirect blocks, nothing to be done.
522	 */
523	if (!topdir || (punch_hole && (limit <= SHMEM_NR_DIRECT)))
524		goto done2;
525
526	BUG_ON(limit <= SHMEM_NR_DIRECT);
527	limit -= SHMEM_NR_DIRECT;
528	idx = (idx > SHMEM_NR_DIRECT)? (idx - SHMEM_NR_DIRECT): 0;
529	offset = idx % ENTRIES_PER_PAGE;
530	idx -= offset;
531
532	dir = shmem_dir_map(topdir);
533	stage = ENTRIES_PER_PAGEPAGE/2;
534	if (idx < ENTRIES_PER_PAGEPAGE/2) {
535		middir = topdir;
536		diroff = idx/ENTRIES_PER_PAGE;
537	} else {
538		dir += ENTRIES_PER_PAGE/2;
539		dir += (idx - ENTRIES_PER_PAGEPAGE/2)/ENTRIES_PER_PAGEPAGE;
540		while (stage <= idx)
541			stage += ENTRIES_PER_PAGEPAGE;
542		middir = *dir;
543		if (*dir) {
544			diroff = ((idx - ENTRIES_PER_PAGEPAGE/2) %
545				ENTRIES_PER_PAGEPAGE) / ENTRIES_PER_PAGE;
546			if (!diroff && !offset) {
547				*dir = NULL;
548				nr_pages_to_free++;
549				list_add(&middir->lru, &pages_to_free);
550			}
551			shmem_dir_unmap(dir);
552			dir = shmem_dir_map(middir);
553		} else {
554			diroff = 0;
555			offset = 0;
556			idx = stage;
557		}
558	}
559
560	for (; idx < limit; idx += ENTRIES_PER_PAGE, diroff++) {
561		if (unlikely(idx == stage)) {
562			shmem_dir_unmap(dir);
563			dir = shmem_dir_map(topdir) +
564			    ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE;
565			while (!*dir) {
566				dir++;
567				idx += ENTRIES_PER_PAGEPAGE;
568				if (idx >= limit)
569					goto done1;
570			}
571			stage = idx + ENTRIES_PER_PAGEPAGE;
572			middir = *dir;
573			*dir = NULL;
574			nr_pages_to_free++;
575			list_add(&middir->lru, &pages_to_free);
576			shmem_dir_unmap(dir);
577			cond_resched();
578			dir = shmem_dir_map(middir);
579			diroff = 0;
580		}
581		subdir = dir[diroff];
582		if (subdir && page_private(subdir)) {
583			size = limit - idx;
584			if (size > ENTRIES_PER_PAGE)
585				size = ENTRIES_PER_PAGE;
586			freed = shmem_map_and_free_swp(subdir,
587						offset, size, &dir);
588			if (!dir)
589				dir = shmem_dir_map(middir);
590			nr_swaps_freed += freed;
591			if (offset)
592				spin_lock(&info->lock);
593			set_page_private(subdir, page_private(subdir) - freed);
594			if (offset)
595				spin_unlock(&info->lock);
596			if (!punch_hole)
597				BUG_ON(page_private(subdir) > offset);
598		}
599		if (offset)
600			offset = 0;
601		else if (subdir && !page_private(subdir)) {
602			dir[diroff] = NULL;
603			nr_pages_to_free++;
604			list_add(&subdir->lru, &pages_to_free);
605		}
606	}
607done1:
608	shmem_dir_unmap(dir);
609done2:
610	if (inode->i_mapping->nrpages && (info->flags & SHMEM_PAGEIN)) {
611		/*
612		 * Call truncate_inode_pages again: racing shmem_unuse_inode
613		 * may have swizzled a page in from swap since vmtruncate or
614		 * generic_delete_inode did it, before we lowered next_index.
615		 * Also, though shmem_getpage checks i_size before adding to
616		 * cache, no recheck after: so fix the narrow window there too.
617		 */
618		truncate_inode_pages_range(inode->i_mapping, start, end);
619	}
620
621	spin_lock(&info->lock);
622	info->flags &= ~SHMEM_TRUNCATE;
623	info->swapped -= nr_swaps_freed;
624	if (nr_pages_to_free)
625		shmem_free_blocks(inode, nr_pages_to_free);
626	shmem_recalc_inode(inode);
627	spin_unlock(&info->lock);
628
629	/*
630	 * Empty swap vector directory pages to be freed?
631	 */
632	if (!list_empty(&pages_to_free)) {
633		pages_to_free.prev->next = NULL;
634		shmem_free_pages(pages_to_free.next);
635	}
636}
637
638static void shmem_truncate(struct inode *inode)
639{
640	shmem_truncate_range(inode, inode->i_size, (loff_t)-1);
641}
642
643static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
644{
645	struct inode *inode = dentry->d_inode;
646	struct page *page = NULL;
647	int error;
648
649	if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
650		if (attr->ia_size < inode->i_size) {
651			/*
652			 * If truncating down to a partial page, then
653			 * if that page is already allocated, hold it
654			 * in memory until the truncation is over, so
655			 * truncate_partial_page cannot miss it were
656			 * it assigned to swap.
657			 */
658			if (attr->ia_size & (PAGE_CACHE_SIZE-1)) {
659				(void) shmem_getpage(inode,
660					attr->ia_size>>PAGE_CACHE_SHIFT,
661						&page, SGP_READ, NULL);
662			}
663			/*
664			 * Reset SHMEM_PAGEIN flag so that shmem_truncate can
665			 * detect if any pages might have been added to cache
666			 * after truncate_inode_pages.  But we needn't bother
667			 * if it's being fully truncated to zero-length: the
668			 * nrpages check is efficient enough in that case.
669			 */
670			if (attr->ia_size) {
671				struct shmem_inode_info *info = SHMEM_I(inode);
672				spin_lock(&info->lock);
673				info->flags &= ~SHMEM_PAGEIN;
674				spin_unlock(&info->lock);
675			}
676		}
677	}
678
679	error = inode_change_ok(inode, attr);
680	if (!error)
681		error = inode_setattr(inode, attr);
682#ifdef CONFIG_TMPFS_POSIX_ACL
683	if (!error && (attr->ia_valid & ATTR_MODE))
684		error = generic_acl_chmod(inode, &shmem_acl_ops);
685#endif
686	if (page)
687		page_cache_release(page);
688	return error;
689}
690
691static void shmem_delete_inode(struct inode *inode)
692{
693	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
694	struct shmem_inode_info *info = SHMEM_I(inode);
695
696	if (inode->i_op->truncate == shmem_truncate) {
697		truncate_inode_pages(inode->i_mapping, 0);
698		shmem_unacct_size(info->flags, inode->i_size);
699		inode->i_size = 0;
700		shmem_truncate(inode);
701		if (!list_empty(&info->swaplist)) {
702			spin_lock(&shmem_swaplist_lock);
703			list_del_init(&info->swaplist);
704			spin_unlock(&shmem_swaplist_lock);
705		}
706	}
707	BUG_ON(inode->i_blocks);
708	if (sbinfo->max_inodes) {
709		spin_lock(&sbinfo->stat_lock);
710		sbinfo->free_inodes++;
711		spin_unlock(&sbinfo->stat_lock);
712	}
713	clear_inode(inode);
714}
715
716static inline int shmem_find_swp(swp_entry_t entry, swp_entry_t *dir, swp_entry_t *edir)
717{
718	swp_entry_t *ptr;
719
720	for (ptr = dir; ptr < edir; ptr++) {
721		if (ptr->val == entry.val)
722			return ptr - dir;
723	}
724	return -1;
725}
726
727static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, struct page *page)
728{
729	struct inode *inode;
730	unsigned long idx;
731	unsigned long size;
732	unsigned long limit;
733	unsigned long stage;
734	struct page **dir;
735	struct page *subdir;
736	swp_entry_t *ptr;
737	int offset;
738
739	idx = 0;
740	ptr = info->i_direct;
741	spin_lock(&info->lock);
742	limit = info->next_index;
743	size = limit;
744	if (size > SHMEM_NR_DIRECT)
745		size = SHMEM_NR_DIRECT;
746	offset = shmem_find_swp(entry, ptr, ptr+size);
747	if (offset >= 0) {
748		shmem_swp_balance_unmap();
749		goto found;
750	}
751	if (!info->i_indirect)
752		goto lost2;
753
754	dir = shmem_dir_map(info->i_indirect);
755	stage = SHMEM_NR_DIRECT + ENTRIES_PER_PAGEPAGE/2;
756
757	for (idx = SHMEM_NR_DIRECT; idx < limit; idx += ENTRIES_PER_PAGE, dir++) {
758		if (unlikely(idx == stage)) {
759			shmem_dir_unmap(dir-1);
760			dir = shmem_dir_map(info->i_indirect) +
761			    ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE;
762			while (!*dir) {
763				dir++;
764				idx += ENTRIES_PER_PAGEPAGE;
765				if (idx >= limit)
766					goto lost1;
767			}
768			stage = idx + ENTRIES_PER_PAGEPAGE;
769			subdir = *dir;
770			shmem_dir_unmap(dir);
771			dir = shmem_dir_map(subdir);
772		}
773		subdir = *dir;
774		if (subdir && page_private(subdir)) {
775			ptr = shmem_swp_map(subdir);
776			size = limit - idx;
777			if (size > ENTRIES_PER_PAGE)
778				size = ENTRIES_PER_PAGE;
779			offset = shmem_find_swp(entry, ptr, ptr+size);
780			if (offset >= 0) {
781				shmem_dir_unmap(dir);
782				goto found;
783			}
784			shmem_swp_unmap(ptr);
785		}
786	}
787lost1:
788	shmem_dir_unmap(dir-1);
789lost2:
790	spin_unlock(&info->lock);
791	return 0;
792found:
793	idx += offset;
794	inode = &info->vfs_inode;
795	if (move_from_swap_cache(page, idx, inode->i_mapping) == 0) {
796		info->flags |= SHMEM_PAGEIN;
797		shmem_swp_set(info, ptr + offset, 0);
798	}
799	shmem_swp_unmap(ptr);
800	spin_unlock(&info->lock);
801	/*
802	 * Decrement swap count even when the entry is left behind:
803	 * try_to_unuse will skip over mms, then reincrement count.
804	 */
805	swap_free(entry);
806	return 1;
807}
808
809/*
810 * shmem_unuse() searches for a possibly swapped-out shmem page.
811 */
812int shmem_unuse(swp_entry_t entry, struct page *page)
813{
814	struct list_head *p, *next;
815	struct shmem_inode_info *info;
816	int found = 0;
817
818	spin_lock(&shmem_swaplist_lock);
819	list_for_each_safe(p, next, &shmem_swaplist) {
820		info = list_entry(p, struct shmem_inode_info, swaplist);
821		if (!info->swapped)
822			list_del_init(&info->swaplist);
823		else if (shmem_unuse_inode(info, entry, page)) {
824			/* move head to start search for next from here */
825			list_move_tail(&shmem_swaplist, &info->swaplist);
826			found = 1;
827			break;
828		}
829	}
830	spin_unlock(&shmem_swaplist_lock);
831	return found;
832}
833
834/*
835 * Move the page from the page cache to the swap cache.
836 */
837static int shmem_writepage(struct page *page, struct writeback_control *wbc)
838{
839	struct shmem_inode_info *info;
840	swp_entry_t *entry, swap;
841	struct address_space *mapping;
842	unsigned long index;
843	struct inode *inode;
844
845	BUG_ON(!PageLocked(page));
846	BUG_ON(page_mapped(page));
847
848	mapping = page->mapping;
849	index = page->index;
850	inode = mapping->host;
851	info = SHMEM_I(inode);
852	if (info->flags & VM_LOCKED)
853		goto redirty;
854	swap = get_swap_page();
855	if (!swap.val)
856		goto redirty;
857
858	spin_lock(&info->lock);
859	shmem_recalc_inode(inode);
860	if (index >= info->next_index) {
861		BUG_ON(!(info->flags & SHMEM_TRUNCATE));
862		goto unlock;
863	}
864	entry = shmem_swp_entry(info, index, NULL);
865	BUG_ON(!entry);
866	BUG_ON(entry->val);
867
868	if (move_to_swap_cache(page, swap) == 0) {
869		shmem_swp_set(info, entry, swap.val);
870		shmem_swp_unmap(entry);
871		spin_unlock(&info->lock);
872		if (list_empty(&info->swaplist)) {
873			spin_lock(&shmem_swaplist_lock);
874			/* move instead of add in case we're racing */
875			list_move_tail(&info->swaplist, &shmem_swaplist);
876			spin_unlock(&shmem_swaplist_lock);
877		}
878		unlock_page(page);
879		return 0;
880	}
881
882	shmem_swp_unmap(entry);
883unlock:
884	spin_unlock(&info->lock);
885	swap_free(swap);
886redirty:
887	set_page_dirty(page);
888	return AOP_WRITEPAGE_ACTIVATE;	/* Return with the page locked */
889}
890
891#ifdef CONFIG_NUMA
892static inline int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes)
893{
894	char *nodelist = strchr(value, ':');
895	int err = 1;
896
897	if (nodelist) {
898		/* NUL-terminate policy string */
899		*nodelist++ = '\0';
900		if (nodelist_parse(nodelist, *policy_nodes))
901			goto out;
902	}
903	if (!strcmp(value, "default")) {
904		*policy = MPOL_DEFAULT;
905		/* Don't allow a nodelist */
906		if (!nodelist)
907			err = 0;
908	} else if (!strcmp(value, "prefer")) {
909		*policy = MPOL_PREFERRED;
910		/* Insist on a nodelist of one node only */
911		if (nodelist) {
912			char *rest = nodelist;
913			while (isdigit(*rest))
914				rest++;
915			if (!*rest)
916				err = 0;
917		}
918	} else if (!strcmp(value, "bind")) {
919		*policy = MPOL_BIND;
920		/* Insist on a nodelist */
921		if (nodelist)
922			err = 0;
923	} else if (!strcmp(value, "interleave")) {
924		*policy = MPOL_INTERLEAVE;
925		/* Default to nodes online if no nodelist */
926		if (!nodelist)
927			*policy_nodes = node_online_map;
928		err = 0;
929	}
930out:
931	/* Restore string for error message */
932	if (nodelist)
933		*--nodelist = ':';
934	return err;
935}
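/*
 * Example inputs for the parser above (the value string of tmpfs's
 * "mpol=" mount option): "default" -> MPOL_DEFAULT, "prefer:2" ->
 * MPOL_PREFERRED with policy_nodes = {2}, "bind:0-1" -> MPOL_BIND over
 * nodes 0-1, "interleave" -> MPOL_INTERLEAVE over all online nodes.
 * Anything else, or a malformed nodelist, leaves err == 1.
 */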
936
937static struct page *shmem_swapin_async(struct shared_policy *p,
938				       swp_entry_t entry, unsigned long idx)
939{
940	struct page *page;
941	struct vm_area_struct pvma;
942
943	/* Create a pseudo vma that just contains the policy */
944	memset(&pvma, 0, sizeof(struct vm_area_struct));
945	pvma.vm_end = PAGE_SIZE;
946	pvma.vm_pgoff = idx;
947	pvma.vm_policy = mpol_shared_policy_lookup(p, idx);
948	page = read_swap_cache_async(entry, &pvma, 0);
949	mpol_free(pvma.vm_policy);
950	return page;
951}
952
953struct page *shmem_swapin(struct shmem_inode_info *info, swp_entry_t entry,
954			  unsigned long idx)
955{
956	struct shared_policy *p = &info->policy;
957	int i, num;
958	struct page *page;
959	unsigned long offset;
960
961	num = valid_swaphandles(entry, &offset);
962	for (i = 0; i < num; offset++, i++) {
963		page = shmem_swapin_async(p,
964				swp_entry(swp_type(entry), offset), idx);
965		if (!page)
966			break;
967		page_cache_release(page);
968	}
969	lru_add_drain();	/* Push any new pages onto the LRU now */
970	return shmem_swapin_async(p, entry, idx);
971}
972
973static struct page *
974shmem_alloc_page(gfp_t gfp, struct shmem_inode_info *info,
975		 unsigned long idx)
976{
977	struct vm_area_struct pvma;
978	struct page *page;
979
980	memset(&pvma, 0, sizeof(struct vm_area_struct));
981	pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx);
982	pvma.vm_pgoff = idx;
983	pvma.vm_end = PAGE_SIZE;
984	page = alloc_page_vma(gfp | __GFP_ZERO, &pvma, 0);
985	mpol_free(pvma.vm_policy);
986	return page;
987}
988#else
989static inline int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes)
990{
991	return 1;
992}
993
994static inline struct page *
995shmem_swapin(struct shmem_inode_info *info,swp_entry_t entry,unsigned long idx)
996{
997	swapin_readahead(entry, 0, NULL);
998	return read_swap_cache_async(entry, NULL, 0);
999}
1000
1001static inline struct page *
1002shmem_alloc_page(gfp_t gfp,struct shmem_inode_info *info, unsigned long idx)
1003{
1004	return alloc_page(gfp | __GFP_ZERO);
1005}
1006#endif
1007
1008/*
1009 * shmem_getpage - either get the page from swap or allocate a new one
1010 *
1011 * If we allocate a new one we do not mark it dirty. That's up to the
1012 * vm. If we swap it in we mark it dirty, since we also free the swap
1013 * entry: a page cannot live in both the swap cache and the page cache.
1014 */
1015static int shmem_getpage(struct inode *inode, unsigned long idx,
1016			struct page **pagep, enum sgp_type sgp, int *type)
1017{
1018	struct address_space *mapping = inode->i_mapping;
1019	struct shmem_inode_info *info = SHMEM_I(inode);
1020	struct shmem_sb_info *sbinfo;
1021	struct page *filepage = *pagep;
1022	struct page *swappage;
1023	swp_entry_t *entry;
1024	swp_entry_t swap;
1025	int error;
1026
1027	if (idx >= SHMEM_MAX_INDEX)
1028		return -EFBIG;
1029	/*
1030	 * Normally, filepage is NULL on entry, and either found
1031	 * uptodate immediately, or allocated and zeroed, or read
1032	 * in under swappage, which is then assigned to filepage.
1033	 * But shmem_prepare_write passes in a locked filepage,
1034	 * which may be found not uptodate by other callers too,
1035	 * and may need to be copied from the swappage read in.
1036	 */
1037repeat:
1038	if (!filepage)
1039		filepage = find_lock_page(mapping, idx);
1040	if (filepage && PageUptodate(filepage))
1041		goto done;
1042	error = 0;
1043	if (sgp == SGP_QUICK)
1044		goto failed;
1045
1046	spin_lock(&info->lock);
1047	shmem_recalc_inode(inode);
1048	entry = shmem_swp_alloc(info, idx, sgp);
1049	if (IS_ERR(entry)) {
1050		spin_unlock(&info->lock);
1051		error = PTR_ERR(entry);
1052		goto failed;
1053	}
1054	swap = *entry;
1055
1056	if (swap.val) {
1057		/* Look it up and read it in.. */
1058		swappage = lookup_swap_cache(swap);
1059		if (!swappage) {
1060			shmem_swp_unmap(entry);
1061			/* here we actually do the io */
1062			if (type && *type == VM_FAULT_MINOR) {
1063				__count_vm_event(PGMAJFAULT);
1064				*type = VM_FAULT_MAJOR;
1065			}
1066			spin_unlock(&info->lock);
1067			swappage = shmem_swapin(info, swap, idx);
1068			if (!swappage) {
1069				spin_lock(&info->lock);
1070				entry = shmem_swp_alloc(info, idx, sgp);
1071				if (IS_ERR(entry))
1072					error = PTR_ERR(entry);
1073				else {
1074					if (entry->val == swap.val)
1075						error = -ENOMEM;
1076					shmem_swp_unmap(entry);
1077				}
1078				spin_unlock(&info->lock);
1079				if (error)
1080					goto failed;
1081				goto repeat;
1082			}
1083			wait_on_page_locked(swappage);
1084			page_cache_release(swappage);
1085			goto repeat;
1086		}
1087
1088		/* We have to do this with page locked to prevent races */
1089		if (TestSetPageLocked(swappage)) {
1090			shmem_swp_unmap(entry);
1091			spin_unlock(&info->lock);
1092			wait_on_page_locked(swappage);
1093			page_cache_release(swappage);
1094			goto repeat;
1095		}
1096		if (PageWriteback(swappage)) {
1097			shmem_swp_unmap(entry);
1098			spin_unlock(&info->lock);
1099			wait_on_page_writeback(swappage);
1100			unlock_page(swappage);
1101			page_cache_release(swappage);
1102			goto repeat;
1103		}
1104		if (!PageUptodate(swappage)) {
1105			shmem_swp_unmap(entry);
1106			spin_unlock(&info->lock);
1107			unlock_page(swappage);
1108			page_cache_release(swappage);
1109			error = -EIO;
1110			goto failed;
1111		}
1112
1113		if (filepage) {
1114			shmem_swp_set(info, entry, 0);
1115			shmem_swp_unmap(entry);
1116			delete_from_swap_cache(swappage);
1117			spin_unlock(&info->lock);
1118			copy_highpage(filepage, swappage);
1119			unlock_page(swappage);
1120			page_cache_release(swappage);
1121			flush_dcache_page(filepage);
1122			SetPageUptodate(filepage);
1123			set_page_dirty(filepage);
1124			swap_free(swap);
1125		} else if (!(error = move_from_swap_cache(
1126				swappage, idx, mapping))) {
1127			info->flags |= SHMEM_PAGEIN;
1128			shmem_swp_set(info, entry, 0);
1129			shmem_swp_unmap(entry);
1130			spin_unlock(&info->lock);
1131			filepage = swappage;
1132			swap_free(swap);
1133		} else {
1134			shmem_swp_unmap(entry);
1135			spin_unlock(&info->lock);
1136			unlock_page(swappage);
1137			page_cache_release(swappage);
1138			if (error == -ENOMEM) {
1139				/* let kswapd refresh zone for GFP_ATOMICs */
1140				congestion_wait(WRITE, HZ/50);
1141			}
1142			goto repeat;
1143		}
1144	} else if (sgp == SGP_READ && !filepage) {
1145		shmem_swp_unmap(entry);
1146		filepage = find_get_page(mapping, idx);
1147		if (filepage &&
1148		    (!PageUptodate(filepage) || TestSetPageLocked(filepage))) {
1149			spin_unlock(&info->lock);
1150			wait_on_page_locked(filepage);
1151			page_cache_release(filepage);
1152			filepage = NULL;
1153			goto repeat;
1154		}
1155		spin_unlock(&info->lock);
1156	} else {
1157		shmem_swp_unmap(entry);
1158		sbinfo = SHMEM_SB(inode->i_sb);
1159		if (sbinfo->max_blocks) {
1160			spin_lock(&sbinfo->stat_lock);
1161			if (sbinfo->free_blocks == 0 ||
1162			    shmem_acct_block(info->flags)) {
1163				spin_unlock(&sbinfo->stat_lock);
1164				spin_unlock(&info->lock);
1165				error = -ENOSPC;
1166				goto failed;
1167			}
1168			sbinfo->free_blocks--;
1169			inode->i_blocks += BLOCKS_PER_PAGE;
1170			spin_unlock(&sbinfo->stat_lock);
1171		} else if (shmem_acct_block(info->flags)) {
1172			spin_unlock(&info->lock);
1173			error = -ENOSPC;
1174			goto failed;
1175		}
1176
1177		if (!filepage) {
1178			spin_unlock(&info->lock);
1179			filepage = shmem_alloc_page(mapping_gfp_mask(mapping),
1180						    info,
1181						    idx);
1182			if (!filepage) {
1183				shmem_unacct_blocks(info->flags, 1);
1184				shmem_free_blocks(inode, 1);
1185				error = -ENOMEM;
1186				goto failed;
1187			}
1188
1189			spin_lock(&info->lock);
1190			entry = shmem_swp_alloc(info, idx, sgp);
1191			if (IS_ERR(entry))
1192				error = PTR_ERR(entry);
1193			else {
1194				swap = *entry;
1195				shmem_swp_unmap(entry);
1196			}
1197			if (error || swap.val || 0 != add_to_page_cache_lru(
1198					filepage, mapping, idx, GFP_ATOMIC)) {
1199				spin_unlock(&info->lock);
1200				page_cache_release(filepage);
1201				shmem_unacct_blocks(info->flags, 1);
1202				shmem_free_blocks(inode, 1);
1203				filepage = NULL;
1204				if (error)
1205					goto failed;
1206				goto repeat;
1207			}
1208			info->flags |= SHMEM_PAGEIN;
1209		}
1210
1211		info->alloced++;
1212		spin_unlock(&info->lock);
1213		flush_dcache_page(filepage);
1214		SetPageUptodate(filepage);
1215	}
1216done:
1217	if (*pagep != filepage) {
1218		unlock_page(filepage);
1219		*pagep = filepage;
1220	}
1221	return 0;
1222
1223failed:
1224	if (*pagep != filepage) {
1225		unlock_page(filepage);
1226		page_cache_release(filepage);
1227	}
1228	return error;
1229}
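/*
 * A note on the protocol above: info->lock is dropped before anything
 * that can sleep (swapin, page allocation, waiting on a locked page),
 * and the swp entry is looked up again afterwards; whenever the world
 * may have changed underneath us, "goto repeat" restarts from the page
 * cache lookup rather than trying to patch up the partial state.
 */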
1230
1231static struct page *shmem_nopage(struct vm_area_struct *vma,
1232				 unsigned long address, int *type)
1233{
1234	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
1235	struct page *page = NULL;
1236	unsigned long idx;
1237	int error;
1238
1239	idx = (address - vma->vm_start) >> PAGE_SHIFT;
1240	idx += vma->vm_pgoff;
1241	idx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT;
1242	if (((loff_t) idx << PAGE_CACHE_SHIFT) >= i_size_read(inode))
1243		return NOPAGE_SIGBUS;
1244
1245	error = shmem_getpage(inode, idx, &page, SGP_CACHE, type);
1246	if (error)
1247		return (error == -ENOMEM)? NOPAGE_OOM: NOPAGE_SIGBUS;
1248
1249	mark_page_accessed(page);
1250	return page;
1251}
1252
1253static int shmem_populate(struct vm_area_struct *vma,
1254	unsigned long addr, unsigned long len,
1255	pgprot_t prot, unsigned long pgoff, int nonblock)
1256{
1257	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
1258	struct mm_struct *mm = vma->vm_mm;
1259	enum sgp_type sgp = nonblock? SGP_QUICK: SGP_CACHE;
1260	unsigned long size;
1261
1262	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
1263	if (pgoff >= size || pgoff + (len >> PAGE_SHIFT) > size)
1264		return -EINVAL;
1265
1266	while ((long) len > 0) {
1267		struct page *page = NULL;
1268		int err;
1269		/*
1270		 * Will need changing if PAGE_CACHE_SIZE != PAGE_SIZE
1271		 */
1272		err = shmem_getpage(inode, pgoff, &page, sgp, NULL);
1273		if (err)
1274			return err;
1275		/* Page may still be null, but only if nonblock was set. */
1276		if (page) {
1277			mark_page_accessed(page);
1278			err = install_page(mm, vma, addr, page, prot);
1279			if (err) {
1280				page_cache_release(page);
1281				return err;
1282			}
1283		} else if (vma->vm_flags & VM_NONLINEAR) {
1284			/* No page was found just because we can't read it in
1285			 * now (being here implies nonblock != 0), but the page
1286			 * may exist, so set the PTE to fault it in later. */
1287			err = install_file_pte(mm, vma, addr, pgoff, prot);
1288			if (err)
1289				return err;
1290		}
1291
1292		len -= PAGE_SIZE;
1293		addr += PAGE_SIZE;
1294		pgoff++;
1295	}
1296	return 0;
1297}
1298
1299#ifdef CONFIG_NUMA
1300int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
1301{
1302	struct inode *i = vma->vm_file->f_path.dentry->d_inode;
1303	return mpol_set_shared_policy(&SHMEM_I(i)->policy, vma, new);
1304}
1305
1306struct mempolicy *
1307shmem_get_policy(struct vm_area_struct *vma, unsigned long addr)
1308{
1309	struct inode *i = vma->vm_file->f_path.dentry->d_inode;
1310	unsigned long idx;
1311
1312	idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
1313	return mpol_shared_policy_lookup(&SHMEM_I(i)->policy, idx);
1314}
1315#endif
1316
1317int shmem_lock(struct file *file, int lock, struct user_struct *user)
1318{
1319	struct inode *inode = file->f_path.dentry->d_inode;
1320	struct shmem_inode_info *info = SHMEM_I(inode);
1321	int retval = -ENOMEM;
1322
1323	spin_lock(&info->lock);
1324	if (lock && !(info->flags & VM_LOCKED)) {
1325		if (!user_shm_lock(inode->i_size, user))
1326			goto out_nomem;
1327		info->flags |= VM_LOCKED;
1328	}
1329	if (!lock && (info->flags & VM_LOCKED) && user) {
1330		user_shm_unlock(inode->i_size, user);
1331		info->flags &= ~VM_LOCKED;
1332	}
1333	retval = 0;
1334out_nomem:
1335	spin_unlock(&info->lock);
1336	return retval;
1337}
1338
1339static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
1340{
1341	file_accessed(file);
1342	vma->vm_ops = &shmem_vm_ops;
1343	return 0;
1344}
1345
1346static struct inode *
1347shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
1348{
1349	struct inode *inode;
1350	struct shmem_inode_info *info;
1351	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
1352
1353	if (sbinfo->max_inodes) {
1354		spin_lock(&sbinfo->stat_lock);
1355		if (!sbinfo->free_inodes) {
1356			spin_unlock(&sbinfo->stat_lock);
1357			return NULL;
1358		}
1359		sbinfo->free_inodes--;
1360		spin_unlock(&sbinfo->stat_lock);
1361	}
1362
1363	inode = new_inode(sb);
1364	if (inode) {
1365		inode->i_mode = mode;
1366		inode->i_uid = current->fsuid;
1367		inode->i_gid = current->fsgid;
1368		inode->i_blocks = 0;
1369		inode->i_mapping->a_ops = &shmem_aops;
1370		inode->i_mapping->backing_dev_info = &shmem_backing_dev_info;
1371		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
1372		inode->i_generation = get_seconds();
1373		info = SHMEM_I(inode);
1374		memset(info, 0, (char *)inode - (char *)info);
1375		spin_lock_init(&info->lock);
1376		INIT_LIST_HEAD(&info->swaplist);
1377
1378		switch (mode & S_IFMT) {
1379		default:
1380			inode->i_op = &shmem_special_inode_operations;
1381			init_special_inode(inode, mode, dev);
1382			break;
1383		case S_IFREG:
1384			inode->i_op = &shmem_inode_operations;
1385			inode->i_fop = &shmem_file_operations;
1386			mpol_shared_policy_init(&info->policy, sbinfo->policy,
1387							&sbinfo->policy_nodes);
1388			break;
1389		case S_IFDIR:
1390			inc_nlink(inode);
1391			/* Some things misbehave if size == 0 on a directory */
1392			inode->i_size = 2 * BOGO_DIRENT_SIZE;
1393			inode->i_op = &shmem_dir_inode_operations;
1394			inode->i_fop = &simple_dir_operations;
1395			break;
1396		case S_IFLNK:
1397			/*
1398			 * Must not load anything in the rbtree,
1399			 * mpol_free_shared_policy will not be called.
1400			 */
1401			mpol_shared_policy_init(&info->policy, MPOL_DEFAULT,
1402						NULL);
1403			break;
1404		}
1405	} else if (sbinfo->max_inodes) {
1406		spin_lock(&sbinfo->stat_lock);
1407		sbinfo->free_inodes++;
1408		spin_unlock(&sbinfo->stat_lock);
1409	}
1410	return inode;
1411}
1412
1413#ifdef CONFIG_TMPFS
1414static const struct inode_operations shmem_symlink_inode_operations;
1415static const struct inode_operations shmem_symlink_inline_operations;
1416
1417/*
1418 * Normally tmpfs makes no use of shmem_prepare_write, but it
1419 * lets a tmpfs file be used read-write below the loop driver.
1420 */
1421static int
1422shmem_prepare_write(struct file *file, struct page *page, unsigned offset, unsigned to)
1423{
1424	struct inode *inode = page->mapping->host;
1425	return shmem_getpage(inode, page->index, &page, SGP_WRITE, NULL);
1426}
1427
1428static ssize_t
1429shmem_file_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
1430{
1431	struct inode	*inode = file->f_path.dentry->d_inode;
1432	loff_t		pos;
1433	unsigned long	written;
1434	ssize_t		err;
1435
1436	if ((ssize_t) count < 0)
1437		return -EINVAL;
1438
1439	if (!access_ok(VERIFY_READ, buf, count))
1440		return -EFAULT;
1441
1442	mutex_lock(&inode->i_mutex);
1443
1444	pos = *ppos;
1445	written = 0;
1446
1447	err = generic_write_checks(file, &pos, &count, 0);
1448	if (err || !count)
1449		goto out;
1450
1451	err = remove_suid(file->f_path.dentry);
1452	if (err)
1453		goto out;
1454
1455	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
1456
1457	do {
1458		struct page *page = NULL;
1459		unsigned long bytes, index, offset;
1460		char *kaddr;
1461		int left;
1462
1463		offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
1464		index = pos >> PAGE_CACHE_SHIFT;
1465		bytes = PAGE_CACHE_SIZE - offset;
1466		if (bytes > count)
1467			bytes = count;
1468
1469		/*
1470		 * We don't hold page lock across copy from user -
1471		 * what would it guard against? - so no deadlock here.
1472		 * But it still may be a good idea to prefault below.
1473		 */
1474
1475		err = shmem_getpage(inode, index, &page, SGP_WRITE, NULL);
1476		if (err)
1477			break;
1478
1479		left = bytes;
1480		if (PageHighMem(page)) {
1481			volatile unsigned char dummy;
1482			__get_user(dummy, buf);
1483			__get_user(dummy, buf + bytes - 1);
1484
1485			kaddr = kmap_atomic(page, KM_USER0);
1486			left = __copy_from_user_inatomic(kaddr + offset,
1487							buf, bytes);
1488			kunmap_atomic(kaddr, KM_USER0);
1489		}
1490		if (left) {
1491			kaddr = kmap(page);
1492			left = __copy_from_user(kaddr + offset, buf, bytes);
1493			kunmap(page);
1494		}
1495
1496		written += bytes;
1497		count -= bytes;
1498		pos += bytes;
1499		buf += bytes;
1500		if (pos > inode->i_size)
1501			i_size_write(inode, pos);
1502
1503		flush_dcache_page(page);
1504		set_page_dirty(page);
1505		mark_page_accessed(page);
1506		page_cache_release(page);
1507
1508		if (left) {
1509			pos -= left;
1510			written -= left;
1511			err = -EFAULT;
1512			break;
1513		}
1514
1515		/*
1516		 * Our dirty pages are not counted in nr_dirty,
1517		 * and we do not attempt to balance dirty pages.
1518		 */
1519
1520		cond_resched();
1521	} while (count);
1522
1523	*ppos = pos;
1524	if (written)
1525		err = written;
1526out:
1527	mutex_unlock(&inode->i_mutex);
1528	return err;
1529}
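/*
 * The copy loop above uses a two-step strategy for highmem pages: it
 * first touches the user buffer with __get_user() so the atomic
 * (non-sleeping) __copy_from_user_inatomic() under kmap_atomic() is
 * unlikely to fault, and if that still leaves bytes uncopied (or the
 * page is not highmem at all) it falls back to a sleeping kmap() copy.
 */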
1530
1531static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc, read_actor_t actor)
1532{
1533	struct inode *inode = filp->f_path.dentry->d_inode;
1534	struct address_space *mapping = inode->i_mapping;
1535	unsigned long index, offset;
1536
1537	index = *ppos >> PAGE_CACHE_SHIFT;
1538	offset = *ppos & ~PAGE_CACHE_MASK;
1539
1540	for (;;) {
1541		struct page *page = NULL;
1542		unsigned long end_index, nr, ret;
1543		loff_t i_size = i_size_read(inode);
1544
1545		end_index = i_size >> PAGE_CACHE_SHIFT;
1546		if (index > end_index)
1547			break;
1548		if (index == end_index) {
1549			nr = i_size & ~PAGE_CACHE_MASK;
1550			if (nr <= offset)
1551				break;
1552		}
1553
1554		desc->error = shmem_getpage(inode, index, &page, SGP_READ, NULL);
1555		if (desc->error) {
1556			if (desc->error == -EINVAL)
1557				desc->error = 0;
1558			break;
1559		}
1560
1561		/*
1562		 * We must re-evaluate i_size after getting the page, since reads
1563		 * (unlike writes) are called without i_mutex protection against truncate
1564		 */
1565		nr = PAGE_CACHE_SIZE;
1566		i_size = i_size_read(inode);
1567		end_index = i_size >> PAGE_CACHE_SHIFT;
1568		if (index == end_index) {
1569			nr = i_size & ~PAGE_CACHE_MASK;
1570			if (nr <= offset) {
1571				if (page)
1572					page_cache_release(page);
1573				break;
1574			}
1575		}
1576		nr -= offset;
1577
1578		if (page) {
1579			/*
1580			 * If users can be writing to this page using arbitrary
1581			 * virtual addresses, take care about potential aliasing
1582			 * before reading the page on the kernel side.
1583			 */
1584			if (mapping_writably_mapped(mapping))
1585				flush_dcache_page(page);
1586			/*
1587			 * Mark the page accessed if we read the beginning.
1588			 */
1589			if (!offset)
1590				mark_page_accessed(page);
1591		} else {
1592			page = ZERO_PAGE(0);
1593			page_cache_get(page);
1594		}
1595
1596		/*
1597		 * Ok, we have the page, and it's up-to-date, so
1598		 * now we can copy it to user space...
1599		 *
1600		 * The actor routine returns how many bytes were actually used..
1601		 * NOTE! This may not be the same as how much of a user buffer
1602		 * we filled up (we may be padding etc), so we can only update
1603		 * "pos" here (the actor routine has to update the user buffer
1604		 * pointers and the remaining count).
1605		 */
1606		ret = actor(desc, page, offset, nr);
1607		offset += ret;
1608		index += offset >> PAGE_CACHE_SHIFT;
1609		offset &= ~PAGE_CACHE_MASK;
1610
1611		page_cache_release(page);
1612		if (ret != nr || !desc->count)
1613			break;
1614
1615		cond_resched();
1616	}
1617
1618	*ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
1619	file_accessed(filp);
1620}
1621
1622static ssize_t shmem_file_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos)
1623{
1624	read_descriptor_t desc;
1625
1626	if ((ssize_t) count < 0)
1627		return -EINVAL;
1628	if (!access_ok(VERIFY_WRITE, buf, count))
1629		return -EFAULT;
1630	if (!count)
1631		return 0;
1632
1633	desc.written = 0;
1634	desc.count = count;
1635	desc.arg.buf = buf;
1636	desc.error = 0;
1637
1638	do_shmem_file_read(filp, ppos, &desc, file_read_actor);
1639	if (desc.written)
1640		return desc.written;
1641	return desc.error;
1642}
1643
1644static ssize_t shmem_file_sendfile(struct file *in_file, loff_t *ppos,
1645			 size_t count, read_actor_t actor, void *target)
1646{
1647	read_descriptor_t desc;
1648
1649	if (!count)
1650		return 0;
1651
1652	desc.written = 0;
1653	desc.count = count;
1654	desc.arg.data = target;
1655	desc.error = 0;
1656
1657	do_shmem_file_read(in_file, ppos, &desc, actor);
1658	if (desc.written)
1659		return desc.written;
1660	return desc.error;
1661}
1662
1663static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
1664{
1665	struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);
1666
1667	buf->f_type = TMPFS_MAGIC;
1668	buf->f_bsize = PAGE_CACHE_SIZE;
1669	buf->f_namelen = NAME_MAX;
1670	spin_lock(&sbinfo->stat_lock);
1671	if (sbinfo->max_blocks) {
1672		buf->f_blocks = sbinfo->max_blocks;
1673		buf->f_bavail = buf->f_bfree = sbinfo->free_blocks;
1674	}
1675	if (sbinfo->max_inodes) {
1676		buf->f_files = sbinfo->max_inodes;
1677		buf->f_ffree = sbinfo->free_inodes;
1678	}
1679	/* else leave those fields 0 like simple_statfs */
1680	spin_unlock(&sbinfo->stat_lock);
1681	return 0;
1682}
1683
1684/*
1685 * File creation. Allocate an inode, and we're done..
1686 */
1687static int
1688shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
1689{
1690	struct inode *inode = shmem_get_inode(dir->i_sb, mode, dev);
1691	int error = -ENOSPC;
1692
1693	if (inode) {
1694		error = security_inode_init_security(inode, dir, NULL, NULL,
1695						     NULL);
1696		if (error) {
1697			if (error != -EOPNOTSUPP) {
1698				iput(inode);
1699				return error;
1700			}
1701		}
1702		error = shmem_acl_init(inode, dir);
1703		if (error) {
1704			iput(inode);
1705			return error;
1706		}
1707		if (dir->i_mode & S_ISGID) {
1708			inode->i_gid = dir->i_gid;
1709			if (S_ISDIR(mode))
1710				inode->i_mode |= S_ISGID;
1711		}
1712		dir->i_size += BOGO_DIRENT_SIZE;
1713		dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1714		d_instantiate(dentry, inode);
1715		dget(dentry); /* Extra count - pin the dentry in core */
1716	}
1717	return error;
1718}
1719
1720static int shmem_mkdir(struct inode *dir, struct dentry *dentry, int mode)
1721{
1722	int error;
1723
1724	if ((error = shmem_mknod(dir, dentry, mode | S_IFDIR, 0)))
1725		return error;
1726	inc_nlink(dir);
1727	return 0;
1728}
1729
1730static int shmem_create(struct inode *dir, struct dentry *dentry, int mode,
1731		struct nameidata *nd)
1732{
1733	return shmem_mknod(dir, dentry, mode | S_IFREG, 0);
1734}
1735
1736/*
1737 * Link a file..
1738 */
1739static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
1740{
1741	struct inode *inode = old_dentry->d_inode;
1742	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
1743
1744	/*
1745	 * No ordinary (disk based) filesystem counts links as inodes;
1746	 * but each new link needs a new dentry, pinning lowmem, and
1747	 * tmpfs dentries cannot be pruned until they are unlinked.
1748	 */
1749	if (sbinfo->max_inodes) {
1750		spin_lock(&sbinfo->stat_lock);
1751		if (!sbinfo->free_inodes) {
1752			spin_unlock(&sbinfo->stat_lock);
1753			return -ENOSPC;
1754		}
1755		sbinfo->free_inodes--;
1756		spin_unlock(&sbinfo->stat_lock);
1757	}
1758
1759	dir->i_size += BOGO_DIRENT_SIZE;
1760	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1761	inc_nlink(inode);
1762	atomic_inc(&inode->i_count);	/* New dentry reference */
1763	dget(dentry);		/* Extra pinning count for the created dentry */
1764	d_instantiate(dentry, inode);
1765	return 0;
1766}
1767
1768static int shmem_unlink(struct inode *dir, struct dentry *dentry)
1769{
1770	struct inode *inode = dentry->d_inode;
1771
1772	if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode)) {
1773		struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
1774		if (sbinfo->max_inodes) {
1775			spin_lock(&sbinfo->stat_lock);
1776			sbinfo->free_inodes++;
1777			spin_unlock(&sbinfo->stat_lock);
1778		}
1779	}
1780
1781	dir->i_size -= BOGO_DIRENT_SIZE;
1782	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1783	drop_nlink(inode);
1784	dput(dentry);	/* Undo the count from "create" - this does all the work */
1785	return 0;
1786}
1787
1788static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
1789{
1790	if (!simple_empty(dentry))
1791		return -ENOTEMPTY;
1792
1793	drop_nlink(dentry->d_inode);
1794	drop_nlink(dir);
1795	return shmem_unlink(dir, dentry);
1796}
1797
1798/*
1799 * The VFS layer already does all the dentry stuff for rename,
1800 * we just have to decrement the usage count for the target if
1801 * it exists so that the VFS layer correctly frees it when it
1802 * gets overwritten.
1803 */
1804static int shmem_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry)
1805{
1806	struct inode *inode = old_dentry->d_inode;
1807	int they_are_dirs = S_ISDIR(inode->i_mode);
1808
1809	if (!simple_empty(new_dentry))
1810		return -ENOTEMPTY;
1811
1812	if (new_dentry->d_inode) {
1813		(void) shmem_unlink(new_dir, new_dentry);
1814		if (they_are_dirs)
1815			drop_nlink(old_dir);
1816	} else if (they_are_dirs) {
1817		drop_nlink(old_dir);
1818		inc_nlink(new_dir);
1819	}
1820
1821	old_dir->i_size -= BOGO_DIRENT_SIZE;
1822	new_dir->i_size += BOGO_DIRENT_SIZE;
1823	old_dir->i_ctime = old_dir->i_mtime =
1824	new_dir->i_ctime = new_dir->i_mtime =
1825	inode->i_ctime = CURRENT_TIME;
1826	return 0;
1827}
1828
1829static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
1830{
1831	int error;
1832	int len;
1833	struct inode *inode;
1834	struct page *page = NULL;
1835	char *kaddr;
1836	struct shmem_inode_info *info;
1837
1838	len = strlen(symname) + 1;
1839	if (len > PAGE_CACHE_SIZE)
1840		return -ENAMETOOLONG;
1841
1842	inode = shmem_get_inode(dir->i_sb, S_IFLNK|S_IRWXUGO, 0);
1843	if (!inode)
1844		return -ENOSPC;
1845
1846	error = security_inode_init_security(inode, dir, NULL, NULL,
1847					     NULL);
1848	if (error) {
1849		if (error != -EOPNOTSUPP) {
1850			iput(inode);
1851			return error;
1852		}
1853		error = 0;
1854	}
1855
1856	info = SHMEM_I(inode);
1857	inode->i_size = len-1;
1858	if (len <= (char *)inode - (char *)info) {
1859		/* do it inline */
1860		memcpy(info, symname, len);
1861		inode->i_op = &shmem_symlink_inline_operations;
1862	} else {
1863		error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL);
1864		if (error) {
1865			iput(inode);
1866			return error;
1867		}
1868		inode->i_op = &shmem_symlink_inode_operations;
1869		kaddr = kmap_atomic(page, KM_USER0);
1870		memcpy(kaddr, symname, len);
1871		kunmap_atomic(kaddr, KM_USER0);
1872		set_page_dirty(page);
1873		page_cache_release(page);
1874	}
1875	if (dir->i_mode & S_ISGID)
1876		inode->i_gid = dir->i_gid;
1877	dir->i_size += BOGO_DIRENT_SIZE;
1878	dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1879	d_instantiate(dentry, inode);
1880	dget(dentry);
1881	return 0;
1882}
1883
1884static void *shmem_follow_link_inline(struct dentry *dentry, struct nameidata *nd)
1885{
1886	nd_set_link(nd, (char *)SHMEM_I(dentry->d_inode));
1887	return NULL;
1888}
1889
1890static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd)
1891{
1892	struct page *page = NULL;
1893	int res = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL);
1894	nd_set_link(nd, res ? ERR_PTR(res) : kmap(page));
1895	return page;
1896}
1897
1898static void shmem_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
1899{
1900	if (!IS_ERR(nd_get_link(nd))) {
1901		struct page *page = cookie;
1902		kunmap(page);
1903		mark_page_accessed(page);
1904		page_cache_release(page);
1905	}
1906}
1907
1908static const struct inode_operations shmem_symlink_inline_operations = {
1909	.readlink	= generic_readlink,
1910	.follow_link	= shmem_follow_link_inline,
1911};
1912
1913static const struct inode_operations shmem_symlink_inode_operations = {
1914	.truncate	= shmem_truncate,
1915	.readlink	= generic_readlink,
1916	.follow_link	= shmem_follow_link,
1917	.put_link	= shmem_put_link,
1918};
1919
1920#ifdef CONFIG_TMPFS_POSIX_ACL
1921/**
1922 * Superblocks without xattr inode operations will get security.* xattr
1923 * support from the VFS "for free". As soon as we have any other xattrs
1924 * like ACLs, we also need to implement the security.* handlers at
1925 * filesystem level, though.
1926 */
1927
1928static size_t shmem_xattr_security_list(struct inode *inode, char *list,
1929					size_t list_len, const char *name,
1930					size_t name_len)
1931{
1932	return security_inode_listsecurity(inode, list, list_len);
1933}
1934
1935static int shmem_xattr_security_get(struct inode *inode, const char *name,
1936				    void *buffer, size_t size)
1937{
1938	if (strcmp(name, "") == 0)
1939		return -EINVAL;
1940	return security_inode_getsecurity(inode, name, buffer, size,
1941					  -EOPNOTSUPP);
1942}
1943
1944static int shmem_xattr_security_set(struct inode *inode, const char *name,
1945				    const void *value, size_t size, int flags)
1946{
1947	if (strcmp(name, "") == 0)
1948		return -EINVAL;
1949	return security_inode_setsecurity(inode, name, value, size, flags);
1950}
1951
1952static struct xattr_handler shmem_xattr_security_handler = {
1953	.prefix = XATTR_SECURITY_PREFIX,
1954	.list   = shmem_xattr_security_list,
1955	.get    = shmem_xattr_security_get,
1956	.set    = shmem_xattr_security_set,
1957};
1958
1959static struct xattr_handler *shmem_xattr_handlers[] = {
1960	&shmem_xattr_acl_access_handler,
1961	&shmem_xattr_acl_default_handler,
1962	&shmem_xattr_security_handler,
1963	NULL
1964};
1965#endif
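/*
 * Illustrative call path (editor's sketch, not original code): with the
 * handler table above installed as sb->s_xattr by shmem_fill_super(), a
 * request such as
 *
 *	getxattr(path, "security.selinux", buf, sizeof(buf));
 *
 * goes through generic_getxattr(), which matches the "security." prefix,
 * passes the remainder of the name to shmem_xattr_security_get(), and so
 * ends up in security_inode_getsecurity().
 */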
1966
1967static struct dentry *shmem_get_parent(struct dentry *child)
1968{
1969	return ERR_PTR(-ESTALE);
1970}
1971
1972static int shmem_match(struct inode *ino, void *vfh)
1973{
1974	__u32 *fh = vfh;
1975	__u64 inum = fh[2];
1976	inum = (inum << 32) | fh[1];
1977	return ino->i_ino == inum && fh[0] == ino->i_generation;
1978}
1979
1980static struct dentry *shmem_get_dentry(struct super_block *sb, void *vfh)
1981{
1982	struct dentry *de = NULL;
1983	struct inode *inode;
1984	__u32 *fh = vfh;
1985	__u64 inum = fh[2];
1986	inum = (inum << 32) | fh[1];
1987
1988	inode = ilookup5(sb, (unsigned long)(inum+fh[0]), shmem_match, vfh);
1989	if (inode) {
1990		de = d_find_alias(inode);
1991		iput(inode);
1992	}
1993
1994	return de ? de : ERR_PTR(-ESTALE);
1995}
1996
1997static struct dentry *shmem_decode_fh(struct super_block *sb, __u32 *fh,
1998		int len, int type,
1999		int (*acceptable)(void *context, struct dentry *de),
2000		void *context)
2001{
2002	if (len < 3)
2003		return ERR_PTR(-ESTALE);
2004
2005	return sb->s_export_op->find_exported_dentry(sb, fh, NULL, acceptable,
2006							context);
2007}
2008
2009static int shmem_encode_fh(struct dentry *dentry, __u32 *fh, int *len,
2010				int connectable)
2011{
2012	struct inode *inode = dentry->d_inode;
2013
2014	if (*len < 3)
2015		return 255;	/* handle buffer too small to encode */
2016
2017	if (hlist_unhashed(&inode->i_hash)) {
2018		/* Unfortunately insert_inode_hash is not idempotent,
2019		 * so as we hash inodes here rather than at creation
2020		 * time, we need a lock to ensure we only try
2021		 * to do it once
2022		 */
2023		static DEFINE_SPINLOCK(lock);
2024		spin_lock(&lock);
2025		if (hlist_unhashed(&inode->i_hash))
2026			__insert_inode_hash(inode,
2027					    inode->i_ino + inode->i_generation);
2028		spin_unlock(&lock);
2029	}
2030
2031	fh[0] = inode->i_generation;
2032	fh[1] = inode->i_ino;
2033	fh[2] = ((__u64)inode->i_ino) >> 32;
2034
2035	*len = 3;
2036	return 1;
2037}
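/*
 * Handle layout, for reference: fh[0] = i_generation, fh[1] = low 32 bits
 * of i_ino, fh[2] = high 32 bits.  shmem_get_dentry() above reassembles the
 * inode number the same way, i.e.
 *
 *	__u64 inum = ((__u64)fh[2] << 32) | fh[1];
 *
 * and then does an ilookup5() under the (i_ino + i_generation) hash value
 * installed here.
 */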
2038
2039static struct export_operations shmem_export_ops = {
2040	.get_parent     = shmem_get_parent,
2041	.get_dentry     = shmem_get_dentry,
2042	.encode_fh      = shmem_encode_fh,
2043	.decode_fh      = shmem_decode_fh,
2044};
2045
2046static int shmem_parse_options(char *options, int *mode, uid_t *uid,
2047	gid_t *gid, unsigned long *blocks, unsigned long *inodes,
2048	int *policy, nodemask_t *policy_nodes)
2049{
2050	char *this_char, *value, *rest;
2051
2052	while (options != NULL) {
2053		this_char = options;
2054		for (;;) {
2055			/*
2056			 * NUL-terminate this option: unfortunately,
2057			 * mount options form a comma-separated list,
2058			 * but mpol's nodelist may also contain commas.
2059			 */
2060			options = strchr(options, ',');
2061			if (options == NULL)
2062				break;
2063			options++;
2064			if (!isdigit(*options)) {
2065				options[-1] = '\0';
2066				break;
2067			}
2068		}
2069		if (!*this_char)
2070			continue;
2071		if ((value = strchr(this_char,'=')) != NULL) {
2072			*value++ = 0;
2073		} else {
2074			printk(KERN_ERR
2075			    "tmpfs: No value for mount option '%s'\n",
2076			    this_char);
2077			return 1;
2078		}
2079
2080		if (!strcmp(this_char,"size")) {
2081			unsigned long long size;
2082			size = memparse(value,&rest);
2083			if (*rest == '%') {
2084				size <<= PAGE_SHIFT;
2085				size *= totalram_pages;
2086				do_div(size, 100);
2087				rest++;
2088			}
2089			if (*rest)
2090				goto bad_val;
2091			*blocks = size >> PAGE_CACHE_SHIFT;
2092		} else if (!strcmp(this_char,"nr_blocks")) {
2093			*blocks = memparse(value,&rest);
2094			if (*rest)
2095				goto bad_val;
2096		} else if (!strcmp(this_char,"nr_inodes")) {
2097			*inodes = memparse(value,&rest);
2098			if (*rest)
2099				goto bad_val;
2100		} else if (!strcmp(this_char,"mode")) {
2101			if (!mode)
2102				continue;
2103			*mode = simple_strtoul(value,&rest,8);
2104			if (*rest)
2105				goto bad_val;
2106		} else if (!strcmp(this_char,"uid")) {
2107			if (!uid)
2108				continue;
2109			*uid = simple_strtoul(value,&rest,0);
2110			if (*rest)
2111				goto bad_val;
2112		} else if (!strcmp(this_char,"gid")) {
2113			if (!gid)
2114				continue;
2115			*gid = simple_strtoul(value,&rest,0);
2116			if (*rest)
2117				goto bad_val;
2118		} else if (!strcmp(this_char,"mpol")) {
2119			if (shmem_parse_mpol(value,policy,policy_nodes))
2120				goto bad_val;
2121		} else {
2122			printk(KERN_ERR "tmpfs: Bad mount option %s\n",
2123			       this_char);
2124			return 1;
2125		}
2126	}
2127	return 0;
2128
2129bad_val:
2130	printk(KERN_ERR "tmpfs: Bad value '%s' for mount option '%s'\n",
2131	       value, this_char);
2132	return 1;
2133
2134}
2135
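/*
 * Example option strings this parser accepts (editor's illustration,
 * assuming the policy:nodelist form understood by shmem_parse_mpol):
 *
 *	size=50%,nr_inodes=8192,mode=1777
 *	mpol=bind:0,2,size=512m
 *
 * In the second line the digits after the commas are what keep the
 * NUL-termination loop above from splitting the mpol nodelist apart.
 */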
2136static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
2137{
2138	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
2139	unsigned long max_blocks = sbinfo->max_blocks;
2140	unsigned long max_inodes = sbinfo->max_inodes;
2141	int policy = sbinfo->policy;
2142	nodemask_t policy_nodes = sbinfo->policy_nodes;
2143	unsigned long blocks;
2144	unsigned long inodes;
2145	int error = -EINVAL;
2146
2147	if (shmem_parse_options(data, NULL, NULL, NULL, &max_blocks,
2148				&max_inodes, &policy, &policy_nodes))
2149		return error;
2150
2151	spin_lock(&sbinfo->stat_lock);
2152	blocks = sbinfo->max_blocks - sbinfo->free_blocks;
2153	inodes = sbinfo->max_inodes - sbinfo->free_inodes;
2154	if (max_blocks < blocks)
2155		goto out;
2156	if (max_inodes < inodes)
2157		goto out;
2158	/*
2159	 * These tests also disallow limited->unlimited while any blocks or inodes
2160	 * are in use, so i_blocks will always be zero when max_blocks is zero;
2161	 * but we must separately disallow unlimited->limited, because
2162	 * in that case we have no record of how much is already in use.
2163	 */
2164	if (max_blocks && !sbinfo->max_blocks)
2165		goto out;
2166	if (max_inodes && !sbinfo->max_inodes)
2167		goto out;
2168
2169	error = 0;
2170	sbinfo->max_blocks  = max_blocks;
2171	sbinfo->free_blocks = max_blocks - blocks;
2172	sbinfo->max_inodes  = max_inodes;
2173	sbinfo->free_inodes = max_inodes - inodes;
2174	sbinfo->policy = policy;
2175	sbinfo->policy_nodes = policy_nodes;
2176out:
2177	spin_unlock(&sbinfo->stat_lock);
2178	return error;
2179}
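/*
 * Remount behaviour, illustrated: with blocks or inodes already in use,
 * "mount -o remount,size=..." succeeds only if the new limits still cover
 * that usage; and, as noted above, an instance mounted unlimited (size=0
 * or nr_inodes=0) cannot later be remounted to a finite limit.
 */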
2180#endif
2181
2182static void shmem_put_super(struct super_block *sb)
2183{
2184	kfree(sb->s_fs_info);
2185	sb->s_fs_info = NULL;
2186}
2187
2188static int shmem_fill_super(struct super_block *sb,
2189			    void *data, int silent)
2190{
2191	struct inode *inode;
2192	struct dentry *root;
2193	int mode   = S_IRWXUGO | S_ISVTX;
2194	uid_t uid = current->fsuid;
2195	gid_t gid = current->fsgid;
2196	int err = -ENOMEM;
2197	struct shmem_sb_info *sbinfo;
2198	unsigned long blocks = 0;
2199	unsigned long inodes = 0;
2200	int policy = MPOL_DEFAULT;
2201	nodemask_t policy_nodes = node_online_map;
2202
2203#ifdef CONFIG_TMPFS
2204	/*
2205	 * By default we allow only half of the physical ram per
2206	 * tmpfs instance, limiting inodes to one per page of lowmem;
2207	 * but the internal instance is left unlimited.
2208	 */
2209	if (!(sb->s_flags & MS_NOUSER)) {
2210		blocks = totalram_pages / 2;
2211		inodes = totalram_pages - totalhigh_pages;
2212		if (inodes > blocks)
2213			inodes = blocks;
2214		if (shmem_parse_options(data, &mode, &uid, &gid, &blocks,
2215					&inodes, &policy, &policy_nodes))
2216			return -EINVAL;
2217	}
2218	sb->s_export_op = &shmem_export_ops;
2219#else
2220	sb->s_flags |= MS_NOUSER;
2221#endif
2222
2223	/* Allocate at least L1_CACHE_BYTES to resist false sharing */
2224	sbinfo = kmalloc(max((int)sizeof(struct shmem_sb_info),
2225				L1_CACHE_BYTES), GFP_KERNEL);
2226	if (!sbinfo)
2227		return -ENOMEM;
2228
2229	spin_lock_init(&sbinfo->stat_lock);
2230	sbinfo->max_blocks = blocks;
2231	sbinfo->free_blocks = blocks;
2232	sbinfo->max_inodes = inodes;
2233	sbinfo->free_inodes = inodes;
2234	sbinfo->policy = policy;
2235	sbinfo->policy_nodes = policy_nodes;
2236
2237	sb->s_fs_info = sbinfo;
2238	sb->s_maxbytes = SHMEM_MAX_BYTES;
2239	sb->s_blocksize = PAGE_CACHE_SIZE;
2240	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
2241	sb->s_magic = TMPFS_MAGIC;
2242	sb->s_op = &shmem_ops;
2243	sb->s_time_gran = 1;
2244#ifdef CONFIG_TMPFS_POSIX_ACL
2245	sb->s_xattr = shmem_xattr_handlers;
2246	sb->s_flags |= MS_POSIXACL;
2247#endif
2248
2249	inode = shmem_get_inode(sb, S_IFDIR | mode, 0);
2250	if (!inode)
2251		goto failed;
2252	inode->i_uid = uid;
2253	inode->i_gid = gid;
2254	root = d_alloc_root(inode);
2255	if (!root)
2256		goto failed_iput;
2257	sb->s_root = root;
2258	return 0;
2259
2260failed_iput:
2261	iput(inode);
2262failed:
2263	shmem_put_super(sb);
2264	return err;
2265}
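/*
 * Worked default (editor's illustration): assuming 4KB pages and
 * totalram_pages = 262144 (1GB), an option-less user mount gets
 * blocks = 131072 (512MB) and nr_inodes capped at the smaller of the
 * lowmem page count and that same 131072.
 */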
2266
2267static struct kmem_cache *shmem_inode_cachep;
2268
2269static struct inode *shmem_alloc_inode(struct super_block *sb)
2270{
2271	struct shmem_inode_info *p;
2272	p = (struct shmem_inode_info *)kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL);
2273	if (!p)
2274		return NULL;
2275	return &p->vfs_inode;
2276}
2277
2278static void shmem_destroy_inode(struct inode *inode)
2279{
2280	if ((inode->i_mode & S_IFMT) == S_IFREG) {
2281		/* only struct inode is valid if it's an inline symlink */
2282		mpol_free_shared_policy(&SHMEM_I(inode)->policy);
2283	}
2284	shmem_acl_destroy_inode(inode);
2285	kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
2286}
2287
2288static void init_once(void *foo, struct kmem_cache *cachep,
2289		      unsigned long flags)
2290{
2291	struct shmem_inode_info *p = (struct shmem_inode_info *) foo;
2292
2293	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
2294	    SLAB_CTOR_CONSTRUCTOR) {
2295		inode_init_once(&p->vfs_inode);
2296#ifdef CONFIG_TMPFS_POSIX_ACL
2297		p->i_acl = NULL;
2298		p->i_default_acl = NULL;
2299#endif
2300	}
2301}
2302
2303static int init_inodecache(void)
2304{
2305	shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
2306				sizeof(struct shmem_inode_info),
2307				0, 0, init_once, NULL);
2308	if (shmem_inode_cachep == NULL)
2309		return -ENOMEM;
2310	return 0;
2311}
2312
2313static void destroy_inodecache(void)
2314{
2315	kmem_cache_destroy(shmem_inode_cachep);
2316}
2317
2318static const struct address_space_operations shmem_aops = {
2319	.writepage	= shmem_writepage,
2320	.set_page_dirty	= __set_page_dirty_no_writeback,
2321#ifdef CONFIG_TMPFS
2322	.prepare_write	= shmem_prepare_write,
2323	.commit_write	= simple_commit_write,
2324#endif
2325	.migratepage	= migrate_page,
2326};
2327
2328static const struct file_operations shmem_file_operations = {
2329	.mmap		= shmem_mmap,
2330#ifdef CONFIG_TMPFS
2331	.llseek		= generic_file_llseek,
2332	.read		= shmem_file_read,
2333	.write		= shmem_file_write,
2334	.fsync		= simple_sync_file,
2335	.sendfile	= shmem_file_sendfile,
2336#endif
2337};
2338
2339static const struct inode_operations shmem_inode_operations = {
2340	.truncate	= shmem_truncate,
2341	.setattr	= shmem_notify_change,
2342	.truncate_range	= shmem_truncate_range,
2343#ifdef CONFIG_TMPFS_POSIX_ACL
2344	.setxattr	= generic_setxattr,
2345	.getxattr	= generic_getxattr,
2346	.listxattr	= generic_listxattr,
2347	.removexattr	= generic_removexattr,
2348	.permission	= shmem_permission,
2349#endif
2350
2351};
2352
2353static const struct inode_operations shmem_dir_inode_operations = {
2354#ifdef CONFIG_TMPFS
2355	.create		= shmem_create,
2356	.lookup		= simple_lookup,
2357	.link		= shmem_link,
2358	.unlink		= shmem_unlink,
2359	.symlink	= shmem_symlink,
2360	.mkdir		= shmem_mkdir,
2361	.rmdir		= shmem_rmdir,
2362	.mknod		= shmem_mknod,
2363	.rename		= shmem_rename,
2364#endif
2365#ifdef CONFIG_TMPFS_POSIX_ACL
2366	.setattr	= shmem_notify_change,
2367	.setxattr	= generic_setxattr,
2368	.getxattr	= generic_getxattr,
2369	.listxattr	= generic_listxattr,
2370	.removexattr	= generic_removexattr,
2371	.permission	= shmem_permission,
2372#endif
2373};
2374
2375static const struct inode_operations shmem_special_inode_operations = {
2376#ifdef CONFIG_TMPFS_POSIX_ACL
2377	.setattr	= shmem_notify_change,
2378	.setxattr	= generic_setxattr,
2379	.getxattr	= generic_getxattr,
2380	.listxattr	= generic_listxattr,
2381	.removexattr	= generic_removexattr,
2382	.permission	= shmem_permission,
2383#endif
2384};
2385
2386static const struct super_operations shmem_ops = {
2387	.alloc_inode	= shmem_alloc_inode,
2388	.destroy_inode	= shmem_destroy_inode,
2389#ifdef CONFIG_TMPFS
2390	.statfs		= shmem_statfs,
2391	.remount_fs	= shmem_remount_fs,
2392#endif
2393	.delete_inode	= shmem_delete_inode,
2394	.drop_inode	= generic_delete_inode,
2395	.put_super	= shmem_put_super,
2396};
2397
2398static struct vm_operations_struct shmem_vm_ops = {
2399	.nopage		= shmem_nopage,
2400	.populate	= shmem_populate,
2401#ifdef CONFIG_NUMA
2402	.set_policy     = shmem_set_policy,
2403	.get_policy     = shmem_get_policy,
2404#endif
2405};
2406
2407
2408static int shmem_get_sb(struct file_system_type *fs_type,
2409	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
2410{
2411	return get_sb_nodev(fs_type, flags, data, shmem_fill_super, mnt);
2412}
2413
2414static struct file_system_type tmpfs_fs_type = {
2415	.owner		= THIS_MODULE,
2416	.name		= "tmpfs",
2417	.get_sb		= shmem_get_sb,
2418	.kill_sb	= kill_litter_super,
2419};
2420static struct vfsmount *shm_mnt;
2421
2422static int __init init_tmpfs(void)
2423{
2424	int error;
2425
2426	error = init_inodecache();
2427	if (error)
2428		goto out3;
2429
2430	error = register_filesystem(&tmpfs_fs_type);
2431	if (error) {
2432		printk(KERN_ERR "Could not register tmpfs\n");
2433		goto out2;
2434	}
2435
2436	shm_mnt = vfs_kern_mount(&tmpfs_fs_type, MS_NOUSER,
2437				tmpfs_fs_type.name, NULL);
2438	if (IS_ERR(shm_mnt)) {
2439		error = PTR_ERR(shm_mnt);
2440		printk(KERN_ERR "Could not kern_mount tmpfs\n");
2441		goto out1;
2442	}
2443	return 0;
2444
2445out1:
2446	unregister_filesystem(&tmpfs_fs_type);
2447out2:
2448	destroy_inodecache();
2449out3:
2450	shm_mnt = ERR_PTR(error);
2451	return error;
2452}
2453module_init(init_tmpfs)
2454
2455/*
2456 * shmem_file_setup - get an unlinked file living in tmpfs
2457 *
2458 * @name: name for dentry (to be seen in /proc/<pid>/maps)
2459 * @size: size to be set for the file
2460 *
2461 */
2462struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
2463{
2464	int error;
2465	struct file *file;
2466	struct inode *inode;
2467	struct dentry *dentry, *root;
2468	struct qstr this;
2469
2470	if (IS_ERR(shm_mnt))
2471		return (void *)shm_mnt;
2472
2473	if (size < 0 || size > SHMEM_MAX_BYTES)
2474		return ERR_PTR(-EINVAL);
2475
2476	if (shmem_acct_size(flags, size))
2477		return ERR_PTR(-ENOMEM);
2478
2479	error = -ENOMEM;
2480	this.name = name;
2481	this.len = strlen(name);
2482	this.hash = 0; /* will go */
2483	root = shm_mnt->mnt_root;
2484	dentry = d_alloc(root, &this);
2485	if (!dentry)
2486		goto put_memory;
2487
2488	error = -ENFILE;
2489	file = get_empty_filp();
2490	if (!file)
2491		goto put_dentry;
2492
2493	error = -ENOSPC;
2494	inode = shmem_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0);
2495	if (!inode)
2496		goto close_file;
2497
2498	SHMEM_I(inode)->flags = flags & VM_ACCOUNT;
2499	d_instantiate(dentry, inode);
2500	inode->i_size = size;
2501	inode->i_nlink = 0;	/* It is unlinked */
2502	file->f_path.mnt = mntget(shm_mnt);
2503	file->f_path.dentry = dentry;
2504	file->f_mapping = inode->i_mapping;
2505	file->f_op = &shmem_file_operations;
2506	file->f_mode = FMODE_WRITE | FMODE_READ;
2507	return file;
2508
2509close_file:
2510	put_filp(file);
2511put_dentry:
2512	dput(dentry);
2513put_memory:
2514	shmem_unacct_size(flags, size);
2515	return ERR_PTR(error);
2516}
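/*
 * Illustrative caller sketch (hypothetical, not part of this file):
 *
 *	struct file *filp;
 *
 *	filp = shmem_file_setup("dev/example", 4 * PAGE_SIZE, VM_ACCOUNT);
 *	if (IS_ERR(filp))
 *		return PTR_ERR(filp);
 *
 * filp is then an unlinked tmpfs file of the requested size; only the
 * VM_ACCOUNT bit of the flags argument is acted on above.  This is
 * essentially how the SysV shared memory code and shmem_zero_setup()
 * below obtain their backing objects.
 */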
2517
2518/*
2519 * shmem_zero_setup - setup a shared anonymous mapping
2520 *
2521 * @vma: the vma to be mmapped, as prepared by do_mmap_pgoff
2522 */
2523int shmem_zero_setup(struct vm_area_struct *vma)
2524{
2525	struct file *file;
2526	loff_t size = vma->vm_end - vma->vm_start;
2527
2528	file = shmem_file_setup("dev/zero", size, vma->vm_flags);
2529	if (IS_ERR(file))
2530		return PTR_ERR(file);
2531
2532	if (vma->vm_file)
2533		fput(vma->vm_file);
2534	vma->vm_file = file;
2535	vma->vm_ops = &shmem_vm_ops;
2536	return 0;
2537}
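/*
 * Example trigger (for reference): a MAP_SHARED | MAP_ANONYMOUS mmap() has
 * no file of its own, so do_mmap_pgoff() calls shmem_zero_setup() to back
 * the vma with an unlinked tmpfs object, e.g. after a userspace call like
 *
 *	p = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *		 MAP_SHARED | MAP_ANONYMOUS, -1, 0);
 */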
2538