1/*
2 * mm/mmap.c
3 *
4 * Written by obz.
5 *
6 * Address space accounting code	<alan@lxorguk.ukuu.org.uk>
7 */
8
9#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
10
11#include <linux/kernel.h>
12#include <linux/slab.h>
13#include <linux/backing-dev.h>
14#include <linux/mm.h>
15#include <linux/vmacache.h>
16#include <linux/shm.h>
17#include <linux/mman.h>
18#include <linux/pagemap.h>
19#include <linux/swap.h>
20#include <linux/syscalls.h>
21#include <linux/capability.h>
22#include <linux/init.h>
23#include <linux/file.h>
24#include <linux/fs.h>
25#include <linux/personality.h>
26#include <linux/security.h>
27#include <linux/hugetlb.h>
28#include <linux/profile.h>
29#include <linux/export.h>
30#include <linux/mount.h>
31#include <linux/mempolicy.h>
32#include <linux/rmap.h>
33#include <linux/mmu_notifier.h>
34#include <linux/mmdebug.h>
35#include <linux/perf_event.h>
36#include <linux/audit.h>
37#include <linux/khugepaged.h>
38#include <linux/uprobes.h>
39#include <linux/rbtree_augmented.h>
40#include <linux/sched/sysctl.h>
41#include <linux/notifier.h>
42#include <linux/memory.h>
43#include <linux/printk.h>
44
45#include <asm/uaccess.h>
46#include <asm/cacheflush.h>
47#include <asm/tlb.h>
48#include <asm/mmu_context.h>
49
50#include "internal.h"
51
52#ifndef arch_mmap_check
53#define arch_mmap_check(addr, len, flags)	(0)
54#endif
55
56#ifndef arch_rebalance_pgtables
57#define arch_rebalance_pgtables(addr, len)		(addr)
58#endif
59
60static void unmap_region(struct mm_struct *mm,
61		struct vm_area_struct *vma, struct vm_area_struct *prev,
62		unsigned long start, unsigned long end);
63
64/* description of effects of mapping type and prot in current implementation.
65 * this is due to the limited x86 page protection hardware.  The expected
66 * behavior is in parens:
67 *
68 * map_type	prot
69 *		PROT_NONE	PROT_READ	PROT_WRITE	PROT_EXEC
70 * MAP_SHARED	r: (no) no	r: (yes) yes	r: (no) yes	r: (no) yes
71 *		w: (no) no	w: (no) no	w: (yes) yes	w: (no) no
72 *		x: (no) no	x: (no) yes	x: (no) yes	x: (yes) yes
73 *
74 * MAP_PRIVATE	r: (no) no	r: (yes) yes	r: (no) yes	r: (no) yes
75 *		w: (no) no	w: (no) no	w: (copy) copy	w: (no) no
76 *		x: (no) no	x: (no) yes	x: (no) yes	x: (yes) yes
77 *
78 */
79pgprot_t protection_map[16] = {
80	__P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111,
81	__S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111
82};
83
84pgprot_t vm_get_page_prot(unsigned long vm_flags)
85{
86	return __pgprot(pgprot_val(protection_map[vm_flags &
87				(VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]) |
88			pgprot_val(arch_vm_get_page_prot(vm_flags)));
89}
90EXPORT_SYMBOL(vm_get_page_prot);
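
/*
 * For illustration: vm_get_page_prot() simply indexes protection_map[]
 * with the low four vm_flags bits, e.g.
 *
 *	pgprot_t prot = vm_get_page_prot(VM_READ | VM_WRITE);
 *
 * picks __P011, the private read/write entry (typically write-protected
 * in hardware so the fault path can do copy-on-write), while adding
 * VM_SHARED to the same flags would select __S011 instead.
 */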
91
92static pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags)
93{
94	return pgprot_modify(oldprot, vm_get_page_prot(vm_flags));
95}
96
97/* Update vma->vm_page_prot to reflect vma->vm_flags. */
98void vma_set_page_prot(struct vm_area_struct *vma)
99{
100	unsigned long vm_flags = vma->vm_flags;
101
102	vma->vm_page_prot = vm_pgprot_modify(vma->vm_page_prot, vm_flags);
103	if (vma_wants_writenotify(vma)) {
104		vm_flags &= ~VM_SHARED;
105		vma->vm_page_prot = vm_pgprot_modify(vma->vm_page_prot,
106						     vm_flags);
107	}
108}
109
110
111int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS;  /* heuristic overcommit */
112int sysctl_overcommit_ratio __read_mostly = 50;	/* default is 50% */
113unsigned long sysctl_overcommit_kbytes __read_mostly;
114int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
115unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
116unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
117/*
118 * Make sure vm_committed_as sits in its own cacheline, not shared with
119 * other variables, since it can be updated frequently by several CPUs.
120 */
121struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp;
122
123/*
124 * The global memory commitment made in the system can be a metric
125 * that can be used to drive ballooning decisions when Linux is hosted
126 * as a guest. On Hyper-V, the host implements a policy engine for dynamically
127 * balancing memory across competing virtual machines that are hosted.
128 * Several metrics drive this policy engine including the guest reported
129 * memory commitment.
130 */
131unsigned long vm_memory_committed(void)
132{
133	return percpu_counter_read_positive(&vm_committed_as);
134}
135EXPORT_SYMBOL_GPL(vm_memory_committed);
136
137/*
138 * Check that a process has enough memory to allocate a new virtual
139 * mapping. 0 means there is enough memory for the allocation to
140 * succeed and -ENOMEM implies there is not.
141 *
142 * We currently support three overcommit policies, which are set via the
143 * vm.overcommit_memory sysctl.  See Documentation/vm/overcommit-accounting
144 *
145 * Strict overcommit modes added 2002 Feb 26 by Alan Cox.
146 * Additional code 2002 Jul 20 by Robert Love.
147 *
148 * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise.
149 *
150 * Note this is a helper function intended to be used by LSMs which
151 * wish to use this logic.
152 */
153int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
154{
155	unsigned long free, allowed, reserve;
156
157	VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) <
158			-(s64)vm_committed_as_batch * num_online_cpus(),
159			"memory commitment underflow");
160
161	vm_acct_memory(pages);
162
163	/*
164	 * Sometimes we want to use more memory than we have
165	 */
166	if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
167		return 0;
168
169	if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
170		free = global_page_state(NR_FREE_PAGES);
171		free += global_page_state(NR_FILE_PAGES);
172
173		/*
174		 * shmem pages shouldn't be counted as free in this
175		 * case: they can't be purged, only swapped out, and
176		 * that won't affect the overall amount of available
177		 * memory in the system.
178		 */
179		free -= global_page_state(NR_SHMEM);
180
181		free += get_nr_swap_pages();
182
183		/*
184		 * Any slabs which are created with the
185		 * SLAB_RECLAIM_ACCOUNT flag claim to have contents
186		 * which are reclaimable, under pressure.  The dentry
187		 * cache and most inode caches should fall into this category.
188		 */
189		free += global_page_state(NR_SLAB_RECLAIMABLE);
190
191		/*
192		 * Leave the reserved pages alone; they are not available for anonymous use.
193		 */
194		if (free <= totalreserve_pages)
195			goto error;
196		else
197			free -= totalreserve_pages;
198
199		/*
200		 * Reserve some for root
201		 */
202		if (!cap_sys_admin)
203			free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
204
205		if (free > pages)
206			return 0;
207
208		goto error;
209	}
210
211	allowed = vm_commit_limit();
212	/*
213	 * Reserve some for root
214	 */
215	if (!cap_sys_admin)
216		allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
217
218	/*
219	 * Don't let a single process grow so big a user can't recover
220	 */
221	if (mm) {
222		reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
223		allowed -= min(mm->total_vm / 32, reserve);
224	}
225
226	if (percpu_counter_read_positive(&vm_committed_as) < allowed)
227		return 0;
228error:
229	vm_unacct_memory(pages);
230
231	return -ENOMEM;
232}
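
/*
 * A worked example of the reserve arithmetic above, assuming 4K pages
 * (PAGE_SHIFT == 12) and the default reserves: kbytes convert to pages
 * by a shift of (PAGE_SHIFT - 10), so
 *
 *	sysctl_admin_reserve_kbytes >> 2 == 8192 KB >> 2 == 2048 pages (8MB)
 *	sysctl_user_reserve_kbytes  >> 2 == 131072 KB >> 2 == 32768 pages (128MB)
 *
 * In the strict (OVERCOMMIT_NEVER) path a non-root task may therefore
 * commit up to vm_commit_limit() minus 2048 pages, further reduced by
 * min(mm->total_vm / 32, 32768) pages for its own mm.
 */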
233
234/*
235 * Requires inode->i_mapping->i_mmap_mutex
236 */
237static void __remove_shared_vm_struct(struct vm_area_struct *vma,
238		struct file *file, struct address_space *mapping)
239{
240	if (vma->vm_flags & VM_DENYWRITE)
241		atomic_inc(&file_inode(file)->i_writecount);
242	if (vma->vm_flags & VM_SHARED)
243		mapping_unmap_writable(mapping);
244
245	flush_dcache_mmap_lock(mapping);
246	if (unlikely(vma->vm_flags & VM_NONLINEAR))
247		list_del_init(&vma->shared.nonlinear);
248	else
249		vma_interval_tree_remove(vma, &mapping->i_mmap);
250	flush_dcache_mmap_unlock(mapping);
251}
252
253/*
254 * Unlink a file-based vm structure from its interval tree, to hide
255 * vma from rmap and vmtruncate before freeing its page tables.
256 */
257void unlink_file_vma(struct vm_area_struct *vma)
258{
259	struct file *file = vma->vm_file;
260
261	if (file) {
262		struct address_space *mapping = file->f_mapping;
263		mutex_lock(&mapping->i_mmap_mutex);
264		__remove_shared_vm_struct(vma, file, mapping);
265		mutex_unlock(&mapping->i_mmap_mutex);
266	}
267}
268
269/*
270 * Close a vm structure and free it, returning the next.
271 */
272static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
273{
274	struct vm_area_struct *next = vma->vm_next;
275
276	might_sleep();
277	if (vma->vm_ops && vma->vm_ops->close)
278		vma->vm_ops->close(vma);
279	if (vma->vm_file)
280		fput(vma->vm_file);
281	mpol_put(vma_policy(vma));
282	kmem_cache_free(vm_area_cachep, vma);
283	return next;
284}
285
286static unsigned long do_brk(unsigned long addr, unsigned long len);
287
288SYSCALL_DEFINE1(brk, unsigned long, brk)
289{
290	unsigned long retval;
291	unsigned long newbrk, oldbrk;
292	struct mm_struct *mm = current->mm;
293	unsigned long min_brk;
294	bool populate;
295
296	down_write(&mm->mmap_sem);
297
298#ifdef CONFIG_COMPAT_BRK
299	/*
300	 * CONFIG_COMPAT_BRK can still be overridden by setting
301	 * randomize_va_space to 2, which will still cause mm->start_brk
302	 * to be arbitrarily shifted
303	 */
304	if (current->brk_randomized)
305		min_brk = mm->start_brk;
306	else
307		min_brk = mm->end_data;
308#else
309	min_brk = mm->start_brk;
310#endif
311	if (brk < min_brk)
312		goto out;
313
314	/*
315	 * Check against rlimit here. If this check is done later after the test
316	 * of oldbrk with newbrk then it can escape the test and let the data
317	 * segment grow beyond its set limit in the case where the limit is
318	 * not page aligned -Ram Gupta
319	 */
320	if (check_data_rlimit(rlimit(RLIMIT_DATA), brk, mm->start_brk,
321			      mm->end_data, mm->start_data))
322		goto out;
323
324	newbrk = PAGE_ALIGN(brk);
325	oldbrk = PAGE_ALIGN(mm->brk);
326	if (oldbrk == newbrk)
327		goto set_brk;
328
329	/* Always allow shrinking brk. */
330	if (brk <= mm->brk) {
331		if (!do_munmap(mm, newbrk, oldbrk-newbrk))
332			goto set_brk;
333		goto out;
334	}
335
336	/* Check against existing mmap mappings. */
337	if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE))
338		goto out;
339
340	/* Ok, looks good - let it rip. */
341	if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk)
342		goto out;
343
344set_brk:
345	mm->brk = brk;
346	populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0;
347	up_write(&mm->mmap_sem);
348	if (populate)
349		mm_populate(oldbrk, newbrk - oldbrk);
350	return brk;
351
352out:
353	retval = mm->brk;
354	up_write(&mm->mmap_sem);
355	return retval;
356}
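
/*
 * For illustration, a minimal userspace sketch of how this syscall is
 * usually driven (glibc's sbrk() is a thin wrapper around brk):
 *
 *	void *old = sbrk(0);            // query the current program break
 *	if (sbrk(4096) != (void *)-1)   // ask the kernel to grow it by a page
 *		memset(old, 0, 4096);   // the newly exposed heap memory
 *
 * The grow path succeeds only if RLIMIT_DATA permits it and no existing
 * mapping overlaps the range being added to the heap.
 */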
357
358static long vma_compute_subtree_gap(struct vm_area_struct *vma)
359{
360	unsigned long max, subtree_gap;
361	max = vma->vm_start;
362	if (vma->vm_prev)
363		max -= vma->vm_prev->vm_end;
364	if (vma->vm_rb.rb_left) {
365		subtree_gap = rb_entry(vma->vm_rb.rb_left,
366				struct vm_area_struct, vm_rb)->rb_subtree_gap;
367		if (subtree_gap > max)
368			max = subtree_gap;
369	}
370	if (vma->vm_rb.rb_right) {
371		subtree_gap = rb_entry(vma->vm_rb.rb_right,
372				struct vm_area_struct, vm_rb)->rb_subtree_gap;
373		if (subtree_gap > max)
374			max = subtree_gap;
375	}
376	return max;
377}
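
/*
 * A small worked example of the value computed above: if the previous
 * vma ends at 0x2000 and this vma starts at 0x5000, the gap immediately
 * before this vma is 0x3000.  rb_subtree_gap is the maximum of that gap
 * and the rb_subtree_gap of the left and right children, so the root's
 * value gives the largest gap preceding any vma in the whole tree.
 */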
378
379#ifdef CONFIG_DEBUG_VM_RB
380static int browse_rb(struct rb_root *root)
381{
382	int i = 0, j, bug = 0;
383	struct rb_node *nd, *pn = NULL;
384	unsigned long prev = 0, pend = 0;
385
386	for (nd = rb_first(root); nd; nd = rb_next(nd)) {
387		struct vm_area_struct *vma;
388		vma = rb_entry(nd, struct vm_area_struct, vm_rb);
389		if (vma->vm_start < prev) {
390			pr_emerg("vm_start %lx < prev %lx\n",
391				  vma->vm_start, prev);
392			bug = 1;
393		}
394		if (vma->vm_start < pend) {
395			pr_emerg("vm_start %lx < pend %lx\n",
396				  vma->vm_start, pend);
397			bug = 1;
398		}
399		if (vma->vm_start > vma->vm_end) {
400			pr_emerg("vm_start %lx > vm_end %lx\n",
401				  vma->vm_start, vma->vm_end);
402			bug = 1;
403		}
404		if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) {
405			pr_emerg("free gap %lx, correct %lx\n",
406			       vma->rb_subtree_gap,
407			       vma_compute_subtree_gap(vma));
408			bug = 1;
409		}
410		i++;
411		pn = nd;
412		prev = vma->vm_start;
413		pend = vma->vm_end;
414	}
415	j = 0;
416	for (nd = pn; nd; nd = rb_prev(nd))
417		j++;
418	if (i != j) {
419		pr_emerg("backwards %d, forwards %d\n", j, i);
420		bug = 1;
421	}
422	return bug ? -1 : i;
423}
424
425static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore)
426{
427	struct rb_node *nd;
428
429	for (nd = rb_first(root); nd; nd = rb_next(nd)) {
430		struct vm_area_struct *vma;
431		vma = rb_entry(nd, struct vm_area_struct, vm_rb);
432		VM_BUG_ON_VMA(vma != ignore &&
433			vma->rb_subtree_gap != vma_compute_subtree_gap(vma),
434			vma);
435	}
436}
437
438static void validate_mm(struct mm_struct *mm)
439{
440	int bug = 0;
441	int i = 0;
442	unsigned long highest_address = 0;
443	struct vm_area_struct *vma = mm->mmap;
444
445	while (vma) {
446		struct anon_vma_chain *avc;
447
448		vma_lock_anon_vma(vma);
449		list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
450			anon_vma_interval_tree_verify(avc);
451		vma_unlock_anon_vma(vma);
452		highest_address = vma->vm_end;
453		vma = vma->vm_next;
454		i++;
455	}
456	if (i != mm->map_count) {
457		pr_emerg("map_count %d vm_next %d\n", mm->map_count, i);
458		bug = 1;
459	}
460	if (highest_address != mm->highest_vm_end) {
461		pr_emerg("mm->highest_vm_end %lx, found %lx\n",
462			  mm->highest_vm_end, highest_address);
463		bug = 1;
464	}
465	i = browse_rb(&mm->mm_rb);
466	if (i != mm->map_count) {
467		if (i != -1)
468			pr_emerg("map_count %d rb %d\n", mm->map_count, i);
469		bug = 1;
470	}
471	VM_BUG_ON_MM(bug, mm);
472}
473#else
474#define validate_mm_rb(root, ignore) do { } while (0)
475#define validate_mm(mm) do { } while (0)
476#endif
477
478RB_DECLARE_CALLBACKS(static, vma_gap_callbacks, struct vm_area_struct, vm_rb,
479		     unsigned long, rb_subtree_gap, vma_compute_subtree_gap)
480
481/*
482 * Update augmented rbtree rb_subtree_gap values after vma->vm_start or
483 * vma->vm_prev->vm_end values changed, without modifying the vma's position
484 * in the rbtree.
485 */
486static void vma_gap_update(struct vm_area_struct *vma)
487{
488	/*
489	 * As it turns out, RB_DECLARE_CALLBACKS() already created a callback
490	 * function that does exactly what we want.
491	 */
492	vma_gap_callbacks_propagate(&vma->vm_rb, NULL);
493}
494
495static inline void vma_rb_insert(struct vm_area_struct *vma,
496				 struct rb_root *root)
497{
498	/* All rb_subtree_gap values must be consistent prior to insertion */
499	validate_mm_rb(root, NULL);
500
501	rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
502}
503
504static void vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root)
505{
506	/*
507	 * All rb_subtree_gap values must be consistent prior to erase,
508	 * with the possible exception of the vma being erased.
509	 */
510	validate_mm_rb(root, vma);
511
512	/*
513	 * Note rb_erase_augmented is a fairly large inline function,
514	 * so make sure we instantiate it only once with our desired
515	 * augmented rbtree callbacks.
516	 */
517	rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
518}
519
520/*
521 * vma has some anon_vma assigned, and is already inserted on that
522 * anon_vma's interval trees.
523 *
524 * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the
525 * vma must be removed from the anon_vma's interval trees using
526 * anon_vma_interval_tree_pre_update_vma().
527 *
528 * After the update, the vma will be reinserted using
529 * anon_vma_interval_tree_post_update_vma().
530 *
531 * The entire update must be protected by exclusive mmap_sem and by
532 * the root anon_vma's mutex.
533 */
534static inline void
535anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma)
536{
537	struct anon_vma_chain *avc;
538
539	list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
540		anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root);
541}
542
543static inline void
544anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
545{
546	struct anon_vma_chain *avc;
547
548	list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
549		anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root);
550}
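
/*
 * The typical update sequence, as used by vma_adjust() below (sketch):
 *
 *	anon_vma_lock_write(anon_vma);
 *	anon_vma_interval_tree_pre_update_vma(vma);
 *	vma->vm_start = new_start;	(and/or vm_end, vm_pgoff)
 *	anon_vma_interval_tree_post_update_vma(vma);
 *	anon_vma_unlock_write(anon_vma);
 *
 * all while holding mmap_sem for writing, per the locking rules above.
 */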
551
552static int find_vma_links(struct mm_struct *mm, unsigned long addr,
553		unsigned long end, struct vm_area_struct **pprev,
554		struct rb_node ***rb_link, struct rb_node **rb_parent)
555{
556	struct rb_node **__rb_link, *__rb_parent, *rb_prev;
557
558	__rb_link = &mm->mm_rb.rb_node;
559	rb_prev = __rb_parent = NULL;
560
561	while (*__rb_link) {
562		struct vm_area_struct *vma_tmp;
563
564		__rb_parent = *__rb_link;
565		vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb);
566
567		if (vma_tmp->vm_end > addr) {
568			/* Fail if an existing vma overlaps the area */
569			if (vma_tmp->vm_start < end)
570				return -ENOMEM;
571			__rb_link = &__rb_parent->rb_left;
572		} else {
573			rb_prev = __rb_parent;
574			__rb_link = &__rb_parent->rb_right;
575		}
576	}
577
578	*pprev = NULL;
579	if (rb_prev)
580		*pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
581	*rb_link = __rb_link;
582	*rb_parent = __rb_parent;
583	return 0;
584}
585
586static unsigned long count_vma_pages_range(struct mm_struct *mm,
587		unsigned long addr, unsigned long end)
588{
589	unsigned long nr_pages = 0;
590	struct vm_area_struct *vma;
591
592	/* Find the first overlapping mapping */
593	vma = find_vma_intersection(mm, addr, end);
594	if (!vma)
595		return 0;
596
597	nr_pages = (min(end, vma->vm_end) -
598		max(addr, vma->vm_start)) >> PAGE_SHIFT;
599
600	/* Iterate over the rest of the overlaps */
601	for (vma = vma->vm_next; vma; vma = vma->vm_next) {
602		unsigned long overlap_len;
603
604		if (vma->vm_start > end)
605			break;
606
607		overlap_len = min(end, vma->vm_end) - vma->vm_start;
608		nr_pages += overlap_len >> PAGE_SHIFT;
609	}
610
611	return nr_pages;
612}
613
614void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
615		struct rb_node **rb_link, struct rb_node *rb_parent)
616{
617	/* Update tracking information for the gap following the new vma. */
618	if (vma->vm_next)
619		vma_gap_update(vma->vm_next);
620	else
621		mm->highest_vm_end = vma->vm_end;
622
623	/*
624	 * vma->vm_prev wasn't known when we followed the rbtree to find the
625	 * correct insertion point for that vma. As a result, we could not
626	 * update the vma's vm_rb parents' rb_subtree_gap values on the way down.
627	 * So, we first insert the vma with a zero rb_subtree_gap value
628	 * (to be consistent with what we did on the way down), and then
629	 * immediately update the gap to the correct value. Finally we
630	 * rebalance the rbtree after all augmented values have been set.
631	 */
632	rb_link_node(&vma->vm_rb, rb_parent, rb_link);
633	vma->rb_subtree_gap = 0;
634	vma_gap_update(vma);
635	vma_rb_insert(vma, &mm->mm_rb);
636}
637
638static void __vma_link_file(struct vm_area_struct *vma)
639{
640	struct file *file;
641
642	file = vma->vm_file;
643	if (file) {
644		struct address_space *mapping = file->f_mapping;
645
646		if (vma->vm_flags & VM_DENYWRITE)
647			atomic_dec(&file_inode(file)->i_writecount);
648		if (vma->vm_flags & VM_SHARED)
649			atomic_inc(&mapping->i_mmap_writable);
650
651		flush_dcache_mmap_lock(mapping);
652		if (unlikely(vma->vm_flags & VM_NONLINEAR))
653			vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
654		else
655			vma_interval_tree_insert(vma, &mapping->i_mmap);
656		flush_dcache_mmap_unlock(mapping);
657	}
658}
659
660static void
661__vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
662	struct vm_area_struct *prev, struct rb_node **rb_link,
663	struct rb_node *rb_parent)
664{
665	__vma_link_list(mm, vma, prev, rb_parent);
666	__vma_link_rb(mm, vma, rb_link, rb_parent);
667}
668
669static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
670			struct vm_area_struct *prev, struct rb_node **rb_link,
671			struct rb_node *rb_parent)
672{
673	struct address_space *mapping = NULL;
674
675	if (vma->vm_file) {
676		mapping = vma->vm_file->f_mapping;
677		mutex_lock(&mapping->i_mmap_mutex);
678	}
679
680	__vma_link(mm, vma, prev, rb_link, rb_parent);
681	__vma_link_file(vma);
682
683	if (mapping)
684		mutex_unlock(&mapping->i_mmap_mutex);
685
686	mm->map_count++;
687	validate_mm(mm);
688}
689
690/*
691 * Helper for vma_adjust() in the split_vma insert case: insert a vma into the
692 * mm's list and rbtree.  It has already been inserted into the interval tree.
693 */
694static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
695{
696	struct vm_area_struct *prev;
697	struct rb_node **rb_link, *rb_parent;
698
699	if (find_vma_links(mm, vma->vm_start, vma->vm_end,
700			   &prev, &rb_link, &rb_parent))
701		BUG();
702	__vma_link(mm, vma, prev, rb_link, rb_parent);
703	mm->map_count++;
704}
705
706static inline void
707__vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
708		struct vm_area_struct *prev)
709{
710	struct vm_area_struct *next;
711
712	vma_rb_erase(vma, &mm->mm_rb);
713	prev->vm_next = next = vma->vm_next;
714	if (next)
715		next->vm_prev = prev;
716
717	/* Kill the cache */
718	vmacache_invalidate(mm);
719}
720
721/*
722 * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that
723 * is already present in an i_mmap tree without adjusting the tree.
724 * The following helper function should be used when such adjustments
725 * are necessary.  The "insert" vma (if any) is to be inserted
726 * before we drop the necessary locks.
727 */
728int vma_adjust(struct vm_area_struct *vma, unsigned long start,
729	unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert)
730{
731	struct mm_struct *mm = vma->vm_mm;
732	struct vm_area_struct *next = vma->vm_next;
733	struct vm_area_struct *importer = NULL;
734	struct address_space *mapping = NULL;
735	struct rb_root *root = NULL;
736	struct anon_vma *anon_vma = NULL;
737	struct file *file = vma->vm_file;
738	bool start_changed = false, end_changed = false;
739	long adjust_next = 0;
740	int remove_next = 0;
741
742	if (next && !insert) {
743		struct vm_area_struct *exporter = NULL;
744
745		if (end >= next->vm_end) {
746			/*
747			 * vma expands, overlapping all the next, and
748			 * perhaps the one after too (mprotect case 6).
749			 */
750again:			remove_next = 1 + (end > next->vm_end);
751			end = next->vm_end;
752			exporter = next;
753			importer = vma;
754		} else if (end > next->vm_start) {
755			/*
756			 * vma expands, overlapping part of the next:
757			 * mprotect case 5 shifting the boundary up.
758			 */
759			adjust_next = (end - next->vm_start) >> PAGE_SHIFT;
760			exporter = next;
761			importer = vma;
762		} else if (end < vma->vm_end) {
763			/*
764			 * vma shrinks, and !insert tells it's not
765			 * split_vma inserting another: so it must be
766			 * mprotect case 4 shifting the boundary down.
767			 */
768			adjust_next = -((vma->vm_end - end) >> PAGE_SHIFT);
769			exporter = vma;
770			importer = next;
771		}
772
773		/*
774		 * Easily overlooked: when mprotect shifts the boundary,
775		 * make sure the expanding vma has anon_vma set if the
776		 * shrinking vma had, to cover any anon pages imported.
777		 */
778		if (exporter && exporter->anon_vma && !importer->anon_vma) {
779			int error;
780
781			error = anon_vma_clone(importer, exporter);
782			if (error)
783				return error;
784			importer->anon_vma = exporter->anon_vma;
785		}
786	}
787
788	if (file) {
789		mapping = file->f_mapping;
790		if (!(vma->vm_flags & VM_NONLINEAR)) {
791			root = &mapping->i_mmap;
792			uprobe_munmap(vma, vma->vm_start, vma->vm_end);
793
794			if (adjust_next)
795				uprobe_munmap(next, next->vm_start,
796							next->vm_end);
797		}
798
799		mutex_lock(&mapping->i_mmap_mutex);
800		if (insert) {
801			/*
802			 * Put into interval tree now, so instantiated pages
803			 * are visible to arm/parisc __flush_dcache_page
804			 * throughout; but we cannot insert into address
805			 * space until vma start or end is updated.
806			 */
807			__vma_link_file(insert);
808		}
809	}
810
811	vma_adjust_trans_huge(vma, start, end, adjust_next);
812
813	anon_vma = vma->anon_vma;
814	if (!anon_vma && adjust_next)
815		anon_vma = next->anon_vma;
816	if (anon_vma) {
817		VM_BUG_ON_VMA(adjust_next && next->anon_vma &&
818			  anon_vma != next->anon_vma, next);
819		anon_vma_lock_write(anon_vma);
820		anon_vma_interval_tree_pre_update_vma(vma);
821		if (adjust_next)
822			anon_vma_interval_tree_pre_update_vma(next);
823	}
824
825	if (root) {
826		flush_dcache_mmap_lock(mapping);
827		vma_interval_tree_remove(vma, root);
828		if (adjust_next)
829			vma_interval_tree_remove(next, root);
830	}
831
832	if (start != vma->vm_start) {
833		vma->vm_start = start;
834		start_changed = true;
835	}
836	if (end != vma->vm_end) {
837		vma->vm_end = end;
838		end_changed = true;
839	}
840	vma->vm_pgoff = pgoff;
841	if (adjust_next) {
842		next->vm_start += adjust_next << PAGE_SHIFT;
843		next->vm_pgoff += adjust_next;
844	}
845
846	if (root) {
847		if (adjust_next)
848			vma_interval_tree_insert(next, root);
849		vma_interval_tree_insert(vma, root);
850		flush_dcache_mmap_unlock(mapping);
851	}
852
853	if (remove_next) {
854		/*
855		 * vma_merge has merged next into vma, and needs
856		 * us to remove next before dropping the locks.
857		 */
858		__vma_unlink(mm, next, vma);
859		if (file)
860			__remove_shared_vm_struct(next, file, mapping);
861	} else if (insert) {
862		/*
863		 * split_vma has split insert from vma, and needs
864		 * us to insert it before dropping the locks
865		 * (it may either follow vma or precede it).
866		 */
867		__insert_vm_struct(mm, insert);
868	} else {
869		if (start_changed)
870			vma_gap_update(vma);
871		if (end_changed) {
872			if (!next)
873				mm->highest_vm_end = end;
874			else if (!adjust_next)
875				vma_gap_update(next);
876		}
877	}
878
879	if (anon_vma) {
880		anon_vma_interval_tree_post_update_vma(vma);
881		if (adjust_next)
882			anon_vma_interval_tree_post_update_vma(next);
883		anon_vma_unlock_write(anon_vma);
884	}
885	if (mapping)
886		mutex_unlock(&mapping->i_mmap_mutex);
887
888	if (root) {
889		uprobe_mmap(vma);
890
891		if (adjust_next)
892			uprobe_mmap(next);
893	}
894
895	if (remove_next) {
896		if (file) {
897			uprobe_munmap(next, next->vm_start, next->vm_end);
898			fput(file);
899		}
900		if (next->anon_vma)
901			anon_vma_merge(vma, next);
902		mm->map_count--;
903		mpol_put(vma_policy(next));
904		kmem_cache_free(vm_area_cachep, next);
905		/*
906		 * In mprotect's case 6 (see comments on vma_merge),
907		 * we must remove another next too. It would clutter
908		 * up the code too much to do both in one go.
909		 */
910		next = vma->vm_next;
911		if (remove_next == 2)
912			goto again;
913		else if (next)
914			vma_gap_update(next);
915		else
916			mm->highest_vm_end = end;
917	}
918	if (insert && file)
919		uprobe_mmap(insert);
920
921	validate_mm(mm);
922
923	return 0;
924}
925
926/*
927 * If the vma has a ->close operation then the driver probably needs to release
928 * per-vma resources, so we don't attempt to merge those.
929 */
930static inline int is_mergeable_vma(struct vm_area_struct *vma,
931			struct file *file, unsigned long vm_flags,
932			const char __user *anon_name)
933{
934	/*
935	 * VM_SOFTDIRTY should not prevent VMA merging if the flags
936	 * match except for the dirty bit -- the caller should mark the
937	 * merged VMA as dirty. If the dirty bit were not excluded from
938	 * the comparison, we would increase pressure on the memory system,
939	 * forcing the kernel to generate new VMAs when old ones could be
940	 * extended instead.
941	 */
942	if ((vma->vm_flags ^ vm_flags) & ~VM_SOFTDIRTY)
943		return 0;
944	if (vma->vm_file != file)
945		return 0;
946	if (vma->vm_ops && vma->vm_ops->close)
947		return 0;
948	if (vma_get_anon_name(vma) != anon_name)
949		return 0;
950	return 1;
951}
952
953static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1,
954					struct anon_vma *anon_vma2,
955					struct vm_area_struct *vma)
956{
957	/*
958	 * The list_is_singular() test is to avoid merging VMAs cloned from
959	 * parents. This can improve scalability by reducing anon_vma lock contention.
960	 */
961	if ((!anon_vma1 || !anon_vma2) && (!vma ||
962		list_is_singular(&vma->anon_vma_chain)))
963		return 1;
964	return anon_vma1 == anon_vma2;
965}
966
967/*
968 * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
969 * in front of (at a lower virtual address and file offset than) the vma.
970 *
971 * We cannot merge two vmas if they have differently assigned (non-NULL)
972 * anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
973 *
974 * We don't check here for the merged mmap wrapping around the end of pagecache
975 * indices (16TB on ia32) because do_mmap_pgoff() does not permit mmap's which
976 * wrap, nor mmaps which cover the final page at index -1UL.
977 */
978static int
979can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
980	struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff,
981	const char __user *anon_name)
982{
983	if (is_mergeable_vma(vma, file, vm_flags, anon_name) &&
984	    is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
985		if (vma->vm_pgoff == vm_pgoff)
986			return 1;
987	}
988	return 0;
989}
990
991/*
992 * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
993 * beyond (at a higher virtual address and file offset than) the vma.
994 *
995 * We cannot merge two vmas if they have differently assigned (non-NULL)
996 * anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
997 */
998static int
999can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
1000	struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff,
1001	const char __user *anon_name)
1002{
1003	if (is_mergeable_vma(vma, file, vm_flags, anon_name) &&
1004	    is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
1005		pgoff_t vm_pglen;
1006		vm_pglen = vma_pages(vma);
1007		if (vma->vm_pgoff + vm_pglen == vm_pgoff)
1008			return 1;
1009	}
1010	return 0;
1011}
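
/*
 * A concrete example of the pgoff check above, assuming 4K pages: a vma
 * mapping file pages [10, 13) at [0xa000, 0xd000) has vm_pglen == 3, so
 * a new request starting at 0xd000 can merge "after" it only if its
 * pgoff is exactly 13, i.e. the combined vma would still map one
 * contiguous run of the file.
 */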
1012
1013/*
1014 * Given a mapping request (addr,end,vm_flags,file,pgoff,anon_name),
1015 * figure out whether that can be merged with its predecessor or its
1016 * successor.  Or both (it neatly fills a hole).
1017 *
1018 * In most cases - when called for mmap, brk or mremap - [addr,end) is
1019 * certain not to be mapped by the time vma_merge is called; but when
1020 * called for mprotect, it is certain to be already mapped (either at
1021 * an offset within prev, or at the start of next), and the flags of
1022 * this area are about to be changed to vm_flags - and the no-change
1023 * case has already been eliminated.
1024 *
1025 * The following mprotect cases have to be considered, where AAAA is
1026 * the area passed down from mprotect_fixup, never extending beyond one
1027 * vma, PPPPPP is the prev vma specified, and NNNNNN the next vma after:
1028 *
1029 *     AAAA             AAAA                AAAA          AAAA
1030 *    PPPPPPNNNNNN    PPPPPPNNNNNN    PPPPPPNNNNNN    PPPPNNNNXXXX
1031 *    cannot merge    might become    might become    might become
1032 *                    PPNNNNNNNNNN    PPPPPPPPPPNN    PPPPPPPPPPPP 6 or
1033 *    mmap, brk or    case 4 below    case 5 below    PPPPPPPPXXXX 7 or
1034 *    mremap move:                                    PPPPNNNNNNNN 8
1035 *        AAAA
1036 *    PPPP    NNNN    PPPPPPPPPPPP    PPPPPPPPNNNN    PPPPNNNNNNNN
1037 *    might become    case 1 below    case 2 below    case 3 below
1038 *
1039 * Odd one out? Case 8, because it extends NNNN but needs flags of XXXX:
1040 * mprotect_fixup updates vm_flags & vm_page_prot on successful return.
1041 */
1042struct vm_area_struct *vma_merge(struct mm_struct *mm,
1043			struct vm_area_struct *prev, unsigned long addr,
1044			unsigned long end, unsigned long vm_flags,
1045		    struct anon_vma *anon_vma, struct file *file,
1046			pgoff_t pgoff, struct mempolicy *policy,
1047			const char __user *anon_name)
1048{
1049	pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
1050	struct vm_area_struct *area, *next;
1051	int err;
1052
1053	/*
1054	 * We later require that vma->vm_flags == vm_flags,
1055	 * so this tests vma->vm_flags & VM_SPECIAL, too.
1056	 */
1057	if (vm_flags & VM_SPECIAL)
1058		return NULL;
1059
1060	if (prev)
1061		next = prev->vm_next;
1062	else
1063		next = mm->mmap;
1064	area = next;
1065	if (next && next->vm_end == end)		/* cases 6, 7, 8 */
1066		next = next->vm_next;
1067
1068	/*
1069	 * Can it merge with the predecessor?
1070	 */
1071	if (prev && prev->vm_end == addr &&
1072  			mpol_equal(vma_policy(prev), policy) &&
1073			can_vma_merge_after(prev, vm_flags, anon_vma,
1074						file, pgoff, anon_name)) {
1075		/*
1076		 * OK, it can.  Can we now merge in the successor as well?
1077		 */
1078		if (next && end == next->vm_start &&
1079				mpol_equal(policy, vma_policy(next)) &&
1080				can_vma_merge_before(next, vm_flags, anon_vma,
1081						file, pgoff+pglen, anon_name) &&
1082				is_mergeable_anon_vma(prev->anon_vma,
1083						      next->anon_vma, NULL)) {
1084							/* cases 1, 6 */
1085			err = vma_adjust(prev, prev->vm_start,
1086				next->vm_end, prev->vm_pgoff, NULL);
1087		} else					/* cases 2, 5, 7 */
1088			err = vma_adjust(prev, prev->vm_start,
1089				end, prev->vm_pgoff, NULL);
1090		if (err)
1091			return NULL;
1092		khugepaged_enter_vma_merge(prev, vm_flags);
1093		return prev;
1094	}
1095
1096	/*
1097	 * Can this new request be merged in front of next?
1098	 */
1099	if (next && end == next->vm_start &&
1100 			mpol_equal(policy, vma_policy(next)) &&
1101			can_vma_merge_before(next, vm_flags, anon_vma,
1102					file, pgoff+pglen, anon_name)) {
1103		if (prev && addr < prev->vm_end)	/* case 4 */
1104			err = vma_adjust(prev, prev->vm_start,
1105				addr, prev->vm_pgoff, NULL);
1106		else					/* cases 3, 8 */
1107			err = vma_adjust(area, addr, next->vm_end,
1108				next->vm_pgoff - pglen, NULL);
1109		if (err)
1110			return NULL;
1111		khugepaged_enter_vma_merge(area, vm_flags);
1112		return area;
1113	}
1114
1115	return NULL;
1116}
1117
1118/*
1119 * Rough compatibility check to quickly see if it's even worth looking
1120 * at sharing an anon_vma.
1121 *
1122 * They need to have the same vm_file, and the flags can only differ
1123 * in things that mprotect may change.
1124 *
1125 * NOTE! The fact that we share an anon_vma doesn't _have_ to mean that
1126 * we can merge the two vma's. For example, we refuse to merge a vma if
1127 * there is a vm_ops->close() function, because that indicates that the
1128 * driver is doing some kind of reference counting. But that doesn't
1129 * really matter for the anon_vma sharing case.
1130 */
1131static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *b)
1132{
1133	return a->vm_end == b->vm_start &&
1134		mpol_equal(vma_policy(a), vma_policy(b)) &&
1135		a->vm_file == b->vm_file &&
1136		!((a->vm_flags ^ b->vm_flags) & ~(VM_READ|VM_WRITE|VM_EXEC|VM_SOFTDIRTY)) &&
1137		b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT);
1138}
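
/*
 * For illustration: two adjacent anonymous vmas that differ only in
 * VM_READ/VM_WRITE/VM_EXEC (say, one of them was mprotect'ed read-only)
 * still pass the flags test above because those bits are masked out,
 * whereas a difference in, e.g., VM_LOCKED or VM_SHARED makes them
 * incompatible.  The pgoff test additionally requires b to begin exactly
 * where a's linear page offsets leave off.
 */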
1139
1140/*
1141 * Do some basic sanity checking to see if we can re-use the anon_vma
1142 * from 'old'. The 'a'/'b' vma's are in VM order - one of them will be
1143 * the same as 'old', the other will be the new one that is trying
1144 * to share the anon_vma.
1145 *
1146 * NOTE! This runs with mm_sem held for reading, so it is possible that
1147 * the anon_vma of 'old' is concurrently in the process of being set up
1148 * by another page fault trying to merge _that_. But that's ok: if it
1149 * is being set up, that automatically means that it will be a singleton
1150 * acceptable for merging, so we can do all of this optimistically. But
1151 * we do that ACCESS_ONCE() to make sure that we never re-load the pointer.
1152 *
1153 * IOW: that the "list_is_singular()" test on the anon_vma_chain only
1154 * matters for the 'stable anon_vma' case (ie the thing we want to avoid
1155 * is to return an anon_vma that is "complex" due to having gone through
1156 * a fork).
1157 *
1158 * We also make sure that the two vma's are compatible (adjacent,
1159 * and with the same memory policies). That's all stable, even with just
1160 * a read lock on the mm_sem.
1161 */
1162static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_area_struct *a, struct vm_area_struct *b)
1163{
1164	if (anon_vma_compatible(a, b)) {
1165		struct anon_vma *anon_vma = ACCESS_ONCE(old->anon_vma);
1166
1167		if (anon_vma && list_is_singular(&old->anon_vma_chain))
1168			return anon_vma;
1169	}
1170	return NULL;
1171}
1172
1173/*
1174 * find_mergeable_anon_vma is used by anon_vma_prepare, to check
1175 * neighbouring vmas for a suitable anon_vma, before it goes off
1176 * to allocate a new anon_vma.  It checks because a repetitive
1177 * sequence of mprotects and faults may otherwise lead to distinct
1178 * anon_vmas being allocated, preventing vma merge in subsequent
1179 * mprotect.
1180 */
1181struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)
1182{
1183	struct anon_vma *anon_vma;
1184	struct vm_area_struct *near;
1185
1186	near = vma->vm_next;
1187	if (!near)
1188		goto try_prev;
1189
1190	anon_vma = reusable_anon_vma(near, vma, near);
1191	if (anon_vma)
1192		return anon_vma;
1193try_prev:
1194	near = vma->vm_prev;
1195	if (!near)
1196		goto none;
1197
1198	anon_vma = reusable_anon_vma(near, near, vma);
1199	if (anon_vma)
1200		return anon_vma;
1201none:
1202	/*
1203	 * There's no absolute need to look only at touching neighbours:
1204	 * we could search further afield for "compatible" anon_vmas.
1205	 * But it would probably just be a waste of time searching,
1206	 * or lead to too many vmas hanging off the same anon_vma.
1207	 * We're trying to allow mprotect remerging later on,
1208	 * not trying to minimize memory used for anon_vmas.
1209	 */
1210	return NULL;
1211}
1212
1213#ifdef CONFIG_PROC_FS
1214void vm_stat_account(struct mm_struct *mm, unsigned long flags,
1215						struct file *file, long pages)
1216{
1217	const unsigned long stack_flags
1218		= VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN);
1219
1220	mm->total_vm += pages;
1221
1222	if (file) {
1223		mm->shared_vm += pages;
1224		if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC)
1225			mm->exec_vm += pages;
1226	} else if (flags & stack_flags)
1227		mm->stack_vm += pages;
1228}
1229#endif /* CONFIG_PROC_FS */
1230
1231/*
1232 * If a hint addr is less than mmap_min_addr, change the hint to be as
1233 * low as possible but still greater than mmap_min_addr.
1234 */
1235static inline unsigned long round_hint_to_min(unsigned long hint)
1236{
1237	hint &= PAGE_MASK;
1238	if (((void *)hint != NULL) &&
1239	    (hint < mmap_min_addr))
1240		return PAGE_ALIGN(mmap_min_addr);
1241	return hint;
1242}
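
/*
 * Example of the rounding above, assuming mmap_min_addr == 0x10000 (64K):
 * a non-NULL hint of 0x8000 lies below the floor and is replaced by
 * 0x10000, while a NULL hint is passed through unchanged so that
 * get_unmapped_area() picks the address itself.
 */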
1243
1244static inline int mlock_future_check(struct mm_struct *mm,
1245				     unsigned long flags,
1246				     unsigned long len)
1247{
1248	unsigned long locked, lock_limit;
1249
1250	/*  mlock MCL_FUTURE? */
1251	if (flags & VM_LOCKED) {
1252		locked = len >> PAGE_SHIFT;
1253		locked += mm->locked_vm;
1254		lock_limit = rlimit(RLIMIT_MEMLOCK);
1255		lock_limit >>= PAGE_SHIFT;
1256		if (locked > lock_limit && !capable(CAP_IPC_LOCK))
1257			return -EAGAIN;
1258	}
1259	return 0;
1260}
1261
1262/*
1263 * The caller must hold down_write(&current->mm->mmap_sem).
1264 */
1265
1266unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
1267			unsigned long len, unsigned long prot,
1268			unsigned long flags, unsigned long pgoff,
1269			unsigned long *populate)
1270{
1271	struct mm_struct *mm = current->mm;
1272	vm_flags_t vm_flags;
1273
1274	*populate = 0;
1275
1276	/*
1277	 * Does the application expect PROT_READ to imply PROT_EXEC?
1278	 *
1279	 * (the exception is when the underlying filesystem is mounted
1280	 *  noexec, in which case we don't add PROT_EXEC.)
1281	 */
1282	if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
1283		if (!(file && (file->f_path.mnt->mnt_flags & MNT_NOEXEC)))
1284			prot |= PROT_EXEC;
1285
1286	if (!len)
1287		return -EINVAL;
1288
1289	if (!(flags & MAP_FIXED))
1290		addr = round_hint_to_min(addr);
1291
1292	/* Careful about overflows.. */
1293	len = PAGE_ALIGN(len);
1294	if (!len)
1295		return -ENOMEM;
1296
1297	/* offset overflow? */
1298	if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
1299		return -EOVERFLOW;
1300
1301	/* Too many mappings? */
1302	if (mm->map_count > sysctl_max_map_count)
1303		return -ENOMEM;
1304
1305	/* Obtain the address to map to. We verify (or select) it and ensure
1306	 * that it represents a valid section of the address space.
1307	 */
1308	addr = get_unmapped_area(file, addr, len, pgoff, flags);
1309	if (addr & ~PAGE_MASK)
1310		return addr;
1311
1312	/* Do simple checking here so the lower-level routines won't have
1313	 * to. We assume access permissions have been handled by the open
1314	 * of the memory object, so we don't do any here.
1315	 */
1316	vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
1317			mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
1318
1319	if (flags & MAP_LOCKED)
1320		if (!can_do_mlock())
1321			return -EPERM;
1322
1323	if (mlock_future_check(mm, vm_flags, len))
1324		return -EAGAIN;
1325
1326	if (file) {
1327		struct inode *inode = file_inode(file);
1328
1329		switch (flags & MAP_TYPE) {
1330		case MAP_SHARED:
1331			if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE))
1332				return -EACCES;
1333
1334			/*
1335			 * Make sure we don't allow writing to an append-only
1336			 * file.
1337			 */
1338			if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE))
1339				return -EACCES;
1340
1341			/*
1342			 * Make sure there are no mandatory locks on the file.
1343			 */
1344			if (locks_verify_locked(file))
1345				return -EAGAIN;
1346
1347			vm_flags |= VM_SHARED | VM_MAYSHARE;
1348			if (!(file->f_mode & FMODE_WRITE))
1349				vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
1350
1351			/* fall through */
1352		case MAP_PRIVATE:
1353			if (!(file->f_mode & FMODE_READ))
1354				return -EACCES;
1355			if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) {
1356				if (vm_flags & VM_EXEC)
1357					return -EPERM;
1358				vm_flags &= ~VM_MAYEXEC;
1359			}
1360
1361			if (!file->f_op->mmap)
1362				return -ENODEV;
1363			if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
1364				return -EINVAL;
1365			break;
1366
1367		default:
1368			return -EINVAL;
1369		}
1370	} else {
1371		switch (flags & MAP_TYPE) {
1372		case MAP_SHARED:
1373			if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
1374				return -EINVAL;
1375			/*
1376			 * Ignore pgoff.
1377			 */
1378			pgoff = 0;
1379			vm_flags |= VM_SHARED | VM_MAYSHARE;
1380			break;
1381		case MAP_PRIVATE:
1382			/*
1383			 * Set pgoff according to addr for anon_vma.
1384			 */
1385			pgoff = addr >> PAGE_SHIFT;
1386			break;
1387		default:
1388			return -EINVAL;
1389		}
1390	}
1391
1392	/*
1393	 * Set 'VM_NORESERVE' if we should not account for the
1394	 * memory use of this mapping.
1395	 */
1396	if (flags & MAP_NORESERVE) {
1397		/* We honor MAP_NORESERVE if allowed to overcommit */
1398		if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
1399			vm_flags |= VM_NORESERVE;
1400
1401		/* hugetlb applies strict overcommit unless MAP_NORESERVE */
1402		if (file && is_file_hugepages(file))
1403			vm_flags |= VM_NORESERVE;
1404	}
1405
1406	addr = mmap_region(file, addr, len, vm_flags, pgoff);
1407	if (!IS_ERR_VALUE(addr) &&
1408	    ((vm_flags & VM_LOCKED) ||
1409	     (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE))
1410		*populate = len;
1411	return addr;
1412}
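
/*
 * For illustration, the kind of userspace requests that make this
 * function set *populate (a sketch, not kernel code):
 *
 *	// prefault the whole mapping up front
 *	p = mmap(NULL, len, PROT_READ, MAP_PRIVATE | MAP_POPULATE, fd, 0);
 *
 *	// or lock it, which implies population as well
 *	p = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *		 MAP_PRIVATE | MAP_ANONYMOUS | MAP_LOCKED, -1, 0);
 *
 * In both cases the caller (vm_mmap_pgoff()) then runs mm_populate()
 * over the returned range.
 */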
1413
1414SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1415		unsigned long, prot, unsigned long, flags,
1416		unsigned long, fd, unsigned long, pgoff)
1417{
1418	struct file *file = NULL;
1419	unsigned long retval = -EBADF;
1420
1421	if (!(flags & MAP_ANONYMOUS)) {
1422		audit_mmap_fd(fd, flags);
1423		file = fget(fd);
1424		if (!file)
1425			goto out;
1426		if (is_file_hugepages(file))
1427			len = ALIGN(len, huge_page_size(hstate_file(file)));
1428		retval = -EINVAL;
1429		if (unlikely(flags & MAP_HUGETLB && !is_file_hugepages(file)))
1430			goto out_fput;
1431	} else if (flags & MAP_HUGETLB) {
1432		struct user_struct *user = NULL;
1433		struct hstate *hs;
1434
1435		hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & SHM_HUGE_MASK);
1436		if (!hs)
1437			return -EINVAL;
1438
1439		len = ALIGN(len, huge_page_size(hs));
1440		/*
1441		 * VM_NORESERVE is used because the reservations will be
1442		 * taken when vm_ops->mmap() is called.
1443		 * A dummy user value is used because we are not locking
1444		 * memory, so no accounting is necessary.
1445		 */
1446		file = hugetlb_file_setup(HUGETLB_ANON_FILE, len,
1447				VM_NORESERVE,
1448				&user, HUGETLB_ANONHUGE_INODE,
1449				(flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
1450		if (IS_ERR(file))
1451			return PTR_ERR(file);
1452	}
1453
1454	flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
1455
1456	retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
1457out_fput:
1458	if (file)
1459		fput(file);
1460out:
1461	return retval;
1462}
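
/*
 * For illustration, an anonymous hugetlb request as handled above
 * (userspace sketch; 21 == log2(2MB), so the shift selects 2MB pages
 * where the architecture supports them):
 *
 *	p = mmap(NULL, 4 * 1024 * 1024, PROT_READ | PROT_WRITE,
 *		 MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB |
 *		 (21 << MAP_HUGE_SHIFT), -1, 0);
 *
 * len is rounded up to a multiple of the huge page size and the mapping
 * is backed by the pseudo-file created by hugetlb_file_setup().
 */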
1463
1464#ifdef __ARCH_WANT_SYS_OLD_MMAP
1465struct mmap_arg_struct {
1466	unsigned long addr;
1467	unsigned long len;
1468	unsigned long prot;
1469	unsigned long flags;
1470	unsigned long fd;
1471	unsigned long offset;
1472};
1473
1474SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
1475{
1476	struct mmap_arg_struct a;
1477
1478	if (copy_from_user(&a, arg, sizeof(a)))
1479		return -EFAULT;
1480	if (a.offset & ~PAGE_MASK)
1481		return -EINVAL;
1482
1483	return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd,
1484			      a.offset >> PAGE_SHIFT);
1485}
1486#endif /* __ARCH_WANT_SYS_OLD_MMAP */
1487
1488/*
1489 * Some shared mappings will want the pages marked read-only
1490 * to track write events. If so, we'll downgrade vm_page_prot
1491 * to the private version (using protection_map[] without the
1492 * VM_SHARED bit).
1493 */
1494int vma_wants_writenotify(struct vm_area_struct *vma)
1495{
1496	vm_flags_t vm_flags = vma->vm_flags;
1497
1498	/* If it was private or non-writable, the write bit is already clear */
1499	if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED)))
1500		return 0;
1501
1502	/* The backer wishes to know when pages are first written to? */
1503	if (vma->vm_ops && vma->vm_ops->page_mkwrite)
1504		return 1;
1505
1506	/* The open routine did something to the protections that pgprot_modify
1507	 * won't preserve? */
1508	if (pgprot_val(vma->vm_page_prot) !=
1509	    pgprot_val(vm_pgprot_modify(vma->vm_page_prot, vm_flags)))
1510		return 0;
1511
1512	/* Do we need to track softdirty? */
1513	if (IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) && !(vm_flags & VM_SOFTDIRTY))
1514		return 1;
1515
1516	/* Specialty mapping? */
1517	if (vm_flags & VM_PFNMAP)
1518		return 0;
1519
1520	/* Can the mapping track the dirty pages? */
1521	return vma->vm_file && vma->vm_file->f_mapping &&
1522		mapping_cap_account_dirty(vma->vm_file->f_mapping);
1523}
1524
1525/*
1526 * We account for memory if it's a private writable mapping that is
1527 * not hugetlb-backed and does not have VM_NORESERVE set.
1528 */
1529static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags)
1530{
1531	/*
1532	 * hugetlb has its own accounting separate from the core VM.
1533	 * VM_HUGETLB may not be set yet, so we cannot check for that flag.
1534	 */
1535	if (file && is_file_hugepages(file))
1536		return 0;
1537
1538	return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE;
1539}
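
/*
 * Examples of the test above: a MAP_PRIVATE, PROT_READ|PROT_WRITE
 * anonymous or file mapping is accountable (VM_WRITE is the only one of
 * the three flags set), whereas a writable MAP_SHARED mapping, a
 * read-only private mapping, or anything mapped with MAP_NORESERVE
 * (when honored) is not.
 */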
1540
1541unsigned long mmap_region(struct file *file, unsigned long addr,
1542		unsigned long len, vm_flags_t vm_flags, unsigned long pgoff)
1543{
1544	struct mm_struct *mm = current->mm;
1545	struct vm_area_struct *vma, *prev;
1546	int error;
1547	struct rb_node **rb_link, *rb_parent;
1548	unsigned long charged = 0;
1549
1550	/* Check against address space limit. */
1551	if (!may_expand_vm(mm, len >> PAGE_SHIFT)) {
1552		unsigned long nr_pages;
1553
1554		/*
1555		 * MAP_FIXED may remove pages of mappings that intersect with the
1556		 * requested mapping. Account for the pages it would unmap.
1557		 */
1558		if (!(vm_flags & MAP_FIXED))
1559			return -ENOMEM;
1560
1561		nr_pages = count_vma_pages_range(mm, addr, addr + len);
1562
1563		if (!may_expand_vm(mm, (len >> PAGE_SHIFT) - nr_pages))
1564			return -ENOMEM;
1565	}
1566
1567	/* Clear old maps */
1568	error = -ENOMEM;
1569munmap_back:
1570	if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) {
1571		if (do_munmap(mm, addr, len))
1572			return -ENOMEM;
1573		goto munmap_back;
1574	}
1575
1576	/*
1577	 * Private writable mapping: check memory availability
1578	 */
1579	if (accountable_mapping(file, vm_flags)) {
1580		charged = len >> PAGE_SHIFT;
1581		if (security_vm_enough_memory_mm(mm, charged))
1582			return -ENOMEM;
1583		vm_flags |= VM_ACCOUNT;
1584	}
1585
1586	/*
1587	 * Can we just expand an old mapping?
1588	 */
1589	vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff,
1590			NULL, NULL);
1591	if (vma)
1592		goto out;
1593
1594	/*
1595	 * Determine the object being mapped and call the appropriate
1596	 * specific mapper. The address has already been validated, but
1597	 * not unmapped; the maps, however, have been removed from the list.
1598	 */
1599	vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
1600	if (!vma) {
1601		error = -ENOMEM;
1602		goto unacct_error;
1603	}
1604
1605	vma->vm_mm = mm;
1606	vma->vm_start = addr;
1607	vma->vm_end = addr + len;
1608	vma->vm_flags = vm_flags;
1609	vma->vm_page_prot = vm_get_page_prot(vm_flags);
1610	vma->vm_pgoff = pgoff;
1611	INIT_LIST_HEAD(&vma->anon_vma_chain);
1612
1613	if (file) {
1614		if (vm_flags & VM_DENYWRITE) {
1615			error = deny_write_access(file);
1616			if (error)
1617				goto free_vma;
1618		}
1619		if (vm_flags & VM_SHARED) {
1620			error = mapping_map_writable(file->f_mapping);
1621			if (error)
1622				goto allow_write_and_free_vma;
1623		}
1624
1625		/* ->mmap() can change vma->vm_file, but must guarantee that
1626		 * vma_link() below can deny write-access if VM_DENYWRITE is set
1627		 * and map writably if VM_SHARED is set. This usually means the
1628		 * new file must not have been exposed to user-space, yet.
1629		 */
1630		vma->vm_file = get_file(file);
1631		error = file->f_op->mmap(file, vma);
1632		if (error)
1633			goto unmap_and_free_vma;
1634
1635		/* Can addr have changed??
1636		 *
1637		 * Answer: Yes, several device drivers can do it in their
1638		 *         f_op->mmap method. -DaveM
1639		 * Bug: If addr is changed, prev, rb_link, rb_parent should
1640		 *      be updated for vma_link()
1641		 */
1642		WARN_ON_ONCE(addr != vma->vm_start);
1643
1644		addr = vma->vm_start;
1645		vm_flags = vma->vm_flags;
1646	} else if (vm_flags & VM_SHARED) {
1647		error = shmem_zero_setup(vma);
1648		if (error)
1649			goto free_vma;
1650	}
1651
1652	vma_link(mm, vma, prev, rb_link, rb_parent);
1653	/* Once vma denies write, undo our temporary denial count */
1654	if (file) {
1655		if (vm_flags & VM_SHARED)
1656			mapping_unmap_writable(file->f_mapping);
1657		if (vm_flags & VM_DENYWRITE)
1658			allow_write_access(file);
1659	}
1660	file = vma->vm_file;
1661out:
1662	perf_event_mmap(vma);
1663
1664	vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
1665	if (vm_flags & VM_LOCKED) {
1666		if (!((vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) ||
1667					vma == get_gate_vma(current->mm)))
1668			mm->locked_vm += (len >> PAGE_SHIFT);
1669		else
1670			vma->vm_flags &= ~VM_LOCKED;
1671	}
1672
1673	if (file)
1674		uprobe_mmap(vma);
1675
1676	/*
1677	 * A new (or expanded) vma always gets soft-dirty status.
1678	 * Otherwise the user-space soft-dirty page tracker would not
1679	 * be able to distinguish the situation where a vma area is unmapped
1680	 * and then a new one is mapped in its place (which must be treated
1681	 * as a completely new data area).
1682	 */
1683	vma->vm_flags |= VM_SOFTDIRTY;
1684
1685	vma_set_page_prot(vma);
1686
1687	return addr;
1688
1689unmap_and_free_vma:
1690	vma->vm_file = NULL;
1691	fput(file);
1692
1693	/* Undo any partial mapping done by a device driver. */
1694	unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
1695	charged = 0;
1696	if (vm_flags & VM_SHARED)
1697		mapping_unmap_writable(file->f_mapping);
1698allow_write_and_free_vma:
1699	if (vm_flags & VM_DENYWRITE)
1700		allow_write_access(file);
1701free_vma:
1702	kmem_cache_free(vm_area_cachep, vma);
1703unacct_error:
1704	if (charged)
1705		vm_unacct_memory(charged);
1706	return error;
1707}
1708
1709unsigned long unmapped_area(struct vm_unmapped_area_info *info)
1710{
1711	/*
1712	 * We implement the search by looking for an rbtree node that
1713	 * immediately follows a suitable gap. That is,
1714	 * - gap_start = vma->vm_prev->vm_end <= info->high_limit - length;
1715	 * - gap_end   = vma->vm_start        >= info->low_limit  + length;
1716	 * - gap_end - gap_start >= length
1717	 */
1718
1719	struct mm_struct *mm = current->mm;
1720	struct vm_area_struct *vma;
1721	unsigned long length, low_limit, high_limit, gap_start, gap_end;
1722
1723	/* Adjust search length to account for worst case alignment overhead */
1724	length = info->length + info->align_mask;
1725	if (length < info->length)
1726		return -ENOMEM;
1727
1728	/* Adjust search limits by the desired length */
1729	if (info->high_limit < length)
1730		return -ENOMEM;
1731	high_limit = info->high_limit - length;
1732
1733	if (info->low_limit > high_limit)
1734		return -ENOMEM;
1735	low_limit = info->low_limit + length;
1736
1737	/* Check if rbtree root looks promising */
1738	if (RB_EMPTY_ROOT(&mm->mm_rb))
1739		goto check_highest;
1740	vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb);
1741	if (vma->rb_subtree_gap < length)
1742		goto check_highest;
1743
1744	while (true) {
1745		/* Visit left subtree if it looks promising */
1746		gap_end = vma->vm_start;
1747		if (gap_end >= low_limit && vma->vm_rb.rb_left) {
1748			struct vm_area_struct *left =
1749				rb_entry(vma->vm_rb.rb_left,
1750					 struct vm_area_struct, vm_rb);
1751			if (left->rb_subtree_gap >= length) {
1752				vma = left;
1753				continue;
1754			}
1755		}
1756
1757		gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0;
1758check_current:
1759		/* Check if current node has a suitable gap */
1760		if (gap_start > high_limit)
1761			return -ENOMEM;
1762		if (gap_end >= low_limit && gap_end - gap_start >= length)
1763			goto found;
1764
1765		/* Visit right subtree if it looks promising */
1766		if (vma->vm_rb.rb_right) {
1767			struct vm_area_struct *right =
1768				rb_entry(vma->vm_rb.rb_right,
1769					 struct vm_area_struct, vm_rb);
1770			if (right->rb_subtree_gap >= length) {
1771				vma = right;
1772				continue;
1773			}
1774		}
1775
1776		/* Go back up the rbtree to find next candidate node */
1777		while (true) {
1778			struct rb_node *prev = &vma->vm_rb;
1779			if (!rb_parent(prev))
1780				goto check_highest;
1781			vma = rb_entry(rb_parent(prev),
1782				       struct vm_area_struct, vm_rb);
1783			if (prev == vma->vm_rb.rb_left) {
1784				gap_start = vma->vm_prev->vm_end;
1785				gap_end = vma->vm_start;
1786				goto check_current;
1787			}
1788		}
1789	}
1790
1791check_highest:
1792	/* Check highest gap, which does not precede any rbtree node */
1793	gap_start = mm->highest_vm_end;
1794	gap_end = ULONG_MAX;  /* Only for VM_BUG_ON below */
1795	if (gap_start > high_limit)
1796		return -ENOMEM;
1797
1798found:
1799	/* We found a suitable gap. Clip it with the original low_limit. */
1800	if (gap_start < info->low_limit)
1801		gap_start = info->low_limit;
1802
1803	/* Adjust gap address to the desired alignment */
1804	gap_start += (info->align_offset - gap_start) & info->align_mask;
1805
1806	VM_BUG_ON(gap_start + info->length > info->high_limit);
1807	VM_BUG_ON(gap_start + info->length > gap_end);
1808	return gap_start;
1809}
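
/*
 * A worked example of the alignment step above: with info->length == 64K,
 * info->align_mask == 0xffff and info->align_offset == 0, a raw gap_start
 * of 0x12345000 becomes
 *
 *	0x12345000 + ((0 - 0x12345000) & 0xffff) == 0x12350000
 *
 * i.e. it is rounded up to the next 64K boundary inside the gap.  The
 * align_mask added to the search length at the top of the function
 * guarantees the gap is still large enough after this adjustment.
 */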
1810
1811unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
1812{
1813	struct mm_struct *mm = current->mm;
1814	struct vm_area_struct *vma;
1815	unsigned long length, low_limit, high_limit, gap_start, gap_end;
1816
1817	/* Adjust search length to account for worst case alignment overhead */
1818	length = info->length + info->align_mask;
1819	if (length < info->length)
1820		return -ENOMEM;
1821
1822	/*
1823	 * Adjust search limits by the desired length.
1824	 * See implementation comment at top of unmapped_area().
1825	 */
1826	gap_end = info->high_limit;
1827	if (gap_end < length)
1828		return -ENOMEM;
1829	high_limit = gap_end - length;
1830
1831	if (info->low_limit > high_limit)
1832		return -ENOMEM;
1833	low_limit = info->low_limit + length;
1834
1835	/* Check highest gap, which does not precede any rbtree node */
1836	gap_start = mm->highest_vm_end;
1837	if (gap_start <= high_limit)
1838		goto found_highest;
1839
1840	/* Check if rbtree root looks promising */
1841	if (RB_EMPTY_ROOT(&mm->mm_rb))
1842		return -ENOMEM;
1843	vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb);
1844	if (vma->rb_subtree_gap < length)
1845		return -ENOMEM;
1846
1847	while (true) {
1848		/* Visit right subtree if it looks promising */
1849		gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0;
1850		if (gap_start <= high_limit && vma->vm_rb.rb_right) {
1851			struct vm_area_struct *right =
1852				rb_entry(vma->vm_rb.rb_right,
1853					 struct vm_area_struct, vm_rb);
1854			if (right->rb_subtree_gap >= length) {
1855				vma = right;
1856				continue;
1857			}
1858		}
1859
1860check_current:
1861		/* Check if current node has a suitable gap */
1862		gap_end = vma->vm_start;
1863		if (gap_end < low_limit)
1864			return -ENOMEM;
1865		if (gap_start <= high_limit && gap_end - gap_start >= length)
1866			goto found;
1867
1868		/* Visit left subtree if it looks promising */
1869		if (vma->vm_rb.rb_left) {
1870			struct vm_area_struct *left =
1871				rb_entry(vma->vm_rb.rb_left,
1872					 struct vm_area_struct, vm_rb);
1873			if (left->rb_subtree_gap >= length) {
1874				vma = left;
1875				continue;
1876			}
1877		}
1878
1879		/* Go back up the rbtree to find next candidate node */
1880		while (true) {
1881			struct rb_node *prev = &vma->vm_rb;
1882			if (!rb_parent(prev))
1883				return -ENOMEM;
1884			vma = rb_entry(rb_parent(prev),
1885				       struct vm_area_struct, vm_rb);
1886			if (prev == vma->vm_rb.rb_right) {
1887				gap_start = vma->vm_prev ?
1888					vma->vm_prev->vm_end : 0;
1889				goto check_current;
1890			}
1891		}
1892	}
1893
1894found:
1895	/* We found a suitable gap. Clip it with the original high_limit. */
1896	if (gap_end > info->high_limit)
1897		gap_end = info->high_limit;
1898
1899found_highest:
1900	/* Compute highest gap address at the desired alignment */
1901	gap_end -= info->length;
1902	gap_end -= (gap_end - info->align_offset) & info->align_mask;
1903
1904	VM_BUG_ON(gap_end < info->low_limit);
1905	VM_BUG_ON(gap_end < gap_start);
1906	return gap_end;
1907}
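
/*
 * Sketch of how a caller drives the two searches above through
 * vm_unmapped_area() (illustrative only; the values loosely follow the
 * SHMLBA cache-colouring done by some architectures for shared mappings
 * and are not taken from this file):
 *
 *	struct vm_unmapped_area_info info;
 *
 *	info.flags = VM_UNMAPPED_AREA_TOPDOWN;
 *	info.length = len;
 *	info.low_limit = max(PAGE_SIZE, mmap_min_addr);
 *	info.high_limit = current->mm->mmap_base;
 *	info.align_mask = SHMLBA - 1;
 *	info.align_offset = pgoff << PAGE_SHIFT;
 *	addr = vm_unmapped_area(&info);
 *	if (IS_ERR_VALUE(addr))
 *		return addr;
 *
 * With info.flags == 0 the bottom-up unmapped_area() is used instead, as
 * in arch_get_unmapped_area() below.
 */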
1908
1909/* Get an address range which is currently unmapped.
1910 * For shmat() with addr=0.
1911 *
1912 * Ugly calling convention alert:
1913 * A return value with the low bits set is an error value,
1914 * i.e.
1915 *	if (ret & ~PAGE_MASK)
1916 *		error = ret;
1917 *
1918 * This function "knows" that -ENOMEM has the bits set.
1919 */
1920#ifndef HAVE_ARCH_UNMAPPED_AREA
1921unsigned long
1922arch_get_unmapped_area(struct file *filp, unsigned long addr,
1923		unsigned long len, unsigned long pgoff, unsigned long flags)
1924{
1925	struct mm_struct *mm = current->mm;
1926	struct vm_area_struct *vma;
1927	struct vm_unmapped_area_info info;
1928
1929	if (len > TASK_SIZE - mmap_min_addr)
1930		return -ENOMEM;
1931
1932	if (flags & MAP_FIXED)
1933		return addr;
1934
1935	if (addr) {
1936		addr = PAGE_ALIGN(addr);
1937		vma = find_vma(mm, addr);
1938		if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
1939		    (!vma || addr + len <= vma->vm_start))
1940			return addr;
1941	}
1942
1943	info.flags = 0;
1944	info.length = len;
1945	info.low_limit = mm->mmap_base;
1946	info.high_limit = TASK_SIZE;
1947	info.align_mask = 0;
1948	return vm_unmapped_area(&info);
1949}
1950#endif
1951
1952/*
1953 * This mmap-allocator allocates new areas top-down from below the
1954 * stack's low limit (the base):
1955 */
1956#ifndef HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
1957unsigned long
1958arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
1959			  const unsigned long len, const unsigned long pgoff,
1960			  const unsigned long flags)
1961{
1962	struct vm_area_struct *vma;
1963	struct mm_struct *mm = current->mm;
1964	unsigned long addr = addr0;
1965	struct vm_unmapped_area_info info;
1966
1967	/* requested length too big for entire address space */
1968	if (len > TASK_SIZE - mmap_min_addr)
1969		return -ENOMEM;
1970
1971	if (flags & MAP_FIXED)
1972		return addr;
1973
1974	/* requesting a specific address */
1975	if (addr) {
1976		addr = PAGE_ALIGN(addr);
1977		vma = find_vma(mm, addr);
1978		if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
1979				(!vma || addr + len <= vma->vm_start))
1980			return addr;
1981	}
1982
1983	info.flags = VM_UNMAPPED_AREA_TOPDOWN;
1984	info.length = len;
1985	info.low_limit = max(PAGE_SIZE, mmap_min_addr);
1986	info.high_limit = mm->mmap_base;
1987	info.align_mask = 0;
1988	addr = vm_unmapped_area(&info);
1989
1990	/*
1991	 * A failed mmap() very likely causes application failure,
1992	 * so fall back to the bottom-up function here. This scenario
1993	 * can happen with large stack limits and large mmap()
1994	 * allocations.
1995	 */
1996	if (addr & ~PAGE_MASK) {
1997		VM_BUG_ON(addr != -ENOMEM);
1998		info.flags = 0;
1999		info.low_limit = TASK_UNMAPPED_BASE;
2000		info.high_limit = TASK_SIZE;
2001		addr = vm_unmapped_area(&info);
2002	}
2003
2004	return addr;
2005}
2006#endif
2007
2008unsigned long
2009get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
2010		unsigned long pgoff, unsigned long flags)
2011{
2012	unsigned long (*get_area)(struct file *, unsigned long,
2013				  unsigned long, unsigned long, unsigned long);
2014
2015	unsigned long error = arch_mmap_check(addr, len, flags);
2016	if (error)
2017		return error;
2018
2019	/* Careful about overflows.. */
2020	if (len > TASK_SIZE)
2021		return -ENOMEM;
2022
2023	get_area = current->mm->get_unmapped_area;
2024	if (file && file->f_op->get_unmapped_area)
2025		get_area = file->f_op->get_unmapped_area;
2026	addr = get_area(file, addr, len, pgoff, flags);
2027	if (IS_ERR_VALUE(addr))
2028		return addr;
2029
2030	if (addr > TASK_SIZE - len)
2031		return -ENOMEM;
2032	if (addr & ~PAGE_MASK)
2033		return -EINVAL;
2034
2035	addr = arch_rebalance_pgtables(addr, len);
2036	error = security_mmap_addr(addr);
2037	return error ? error : addr;
2038}
2039
2040EXPORT_SYMBOL(get_unmapped_area);
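
/*
 * Illustration (hypothetical driver, not from this file) of the
 * file->f_op->get_unmapped_area hook consulted above: a file provider can
 * veto or redirect placement before falling back to the mm's default
 * policy.  The foo_* names and FOO_MAX_MAPPING below are made up for this
 * sketch:
 *
 *	static unsigned long foo_get_unmapped_area(struct file *file,
 *			unsigned long addr, unsigned long len,
 *			unsigned long pgoff, unsigned long flags)
 *	{
 *		if (len > FOO_MAX_MAPPING)
 *			return -ENOMEM;
 *		return current->mm->get_unmapped_area(file, addr, len,
 *						      pgoff, flags);
 *	}
 *
 *	static const struct file_operations foo_fops = {
 *		.mmap		   = foo_mmap,
 *		.get_unmapped_area = foo_get_unmapped_area,
 *	};
 */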
2041
2042/* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
2043struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
2044{
2045	struct rb_node *rb_node;
2046	struct vm_area_struct *vma;
2047
2048	/* Check the cache first. */
2049	vma = vmacache_find(mm, addr);
2050	if (likely(vma))
2051		return vma;
2052
2053	rb_node = mm->mm_rb.rb_node;
2054	vma = NULL;
2055
2056	while (rb_node) {
2057		struct vm_area_struct *tmp;
2058
2059		tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb);
2060
2061		if (tmp->vm_end > addr) {
2062			vma = tmp;
2063			if (tmp->vm_start <= addr)
2064				break;
2065			rb_node = rb_node->rb_left;
2066		} else
2067			rb_node = rb_node->rb_right;
2068	}
2069
2070	if (vma)
2071		vmacache_update(addr, vma);
2072	return vma;
2073}
2074
2075EXPORT_SYMBOL(find_vma);
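
/*
 * Usage note (illustrative): because find_vma() returns the first vma
 * with addr < vm_end, the vma it returns does not necessarily contain
 * addr; it may lie entirely above it.  A caller asking "is this address
 * mapped?" must therefore also check vm_start:
 *
 *	vma = find_vma(mm, addr);
 *	if (vma && vma->vm_start <= addr)
 *		... addr falls inside an existing mapping ...
 *	else
 *		... addr is in a hole; vma, if non-NULL, is the next
 *		    mapping above it ...
 */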
2076
2077/*
2078 * Same as find_vma, but also return a pointer to the previous VMA in *pprev.
2079 */
2080struct vm_area_struct *
2081find_vma_prev(struct mm_struct *mm, unsigned long addr,
2082			struct vm_area_struct **pprev)
2083{
2084	struct vm_area_struct *vma;
2085
2086	vma = find_vma(mm, addr);
2087	if (vma) {
2088		*pprev = vma->vm_prev;
2089	} else {
2090		struct rb_node *rb_node = mm->mm_rb.rb_node;
2091		*pprev = NULL;
2092		while (rb_node) {
2093			*pprev = rb_entry(rb_node, struct vm_area_struct, vm_rb);
2094			rb_node = rb_node->rb_right;
2095		}
2096	}
2097	return vma;
2098}
2099
2100/*
2101 * Verify that the stack growth is acceptable and
2102 * update accounting. This is shared with both the
2103 * grow-up and grow-down cases.
2104 */
2105static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, unsigned long grow)
2106{
2107	struct mm_struct *mm = vma->vm_mm;
2108	struct rlimit *rlim = current->signal->rlim;
2109	unsigned long new_start;
2110
2111	/* address space limit tests */
2112	if (!may_expand_vm(mm, grow))
2113		return -ENOMEM;
2114
2115	/* Stack limit test */
2116	if (size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur))
2117		return -ENOMEM;
2118
2119	/* mlock limit tests */
2120	if (vma->vm_flags & VM_LOCKED) {
2121		unsigned long locked;
2122		unsigned long limit;
2123		locked = mm->locked_vm + grow;
2124		limit = ACCESS_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur);
2125		limit >>= PAGE_SHIFT;
2126		if (locked > limit && !capable(CAP_IPC_LOCK))
2127			return -ENOMEM;
2128	}
2129
2130	/* Check to ensure the stack will not grow into a hugetlb-only region */
2131	new_start = (vma->vm_flags & VM_GROWSUP) ? vma->vm_start :
2132			vma->vm_end - size;
2133	if (is_hugepage_only_range(vma->vm_mm, new_start, size))
2134		return -EFAULT;
2135
2136	/*
2137	 * Overcommit..  This must be the final test, as it will
2138	 * update security statistics.
2139	 */
2140	if (security_vm_enough_memory_mm(mm, grow))
2141		return -ENOMEM;
2142
2143	/* Ok, everything looks good - let it rip */
2144	if (vma->vm_flags & VM_LOCKED)
2145		mm->locked_vm += grow;
2146	vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow);
2147	return 0;
2148}
2149
2150#if defined(CONFIG_STACK_GROWSUP) || defined(CONFIG_IA64)
2151/*
2152 * PA-RISC uses this for its stack; IA64 for its Register Backing Store.
2153 * vma is the last one with address > vma->vm_end.  Have to extend vma.
2154 */
2155int expand_upwards(struct vm_area_struct *vma, unsigned long address)
2156{
2157	int error;
2158
2159	if (!(vma->vm_flags & VM_GROWSUP))
2160		return -EFAULT;
2161
2162	/*
2163	 * We must make sure the anon_vma is allocated
2164	 * so that the anon_vma locking is not a noop.
2165	 */
2166	if (unlikely(anon_vma_prepare(vma)))
2167		return -ENOMEM;
2168	vma_lock_anon_vma(vma);
2169
2170	/*
2171	 * vma->vm_start/vm_end cannot change under us because the caller
2172	 * is required to hold the mmap_sem in read mode.  We need the
2173	 * anon_vma lock to serialize against concurrent expand_stacks.
2174	 * Also guard against wrapping around to address 0.
2175	 */
2176	if (address < PAGE_ALIGN(address+4))
2177		address = PAGE_ALIGN(address+4);
2178	else {
2179		vma_unlock_anon_vma(vma);
2180		return -ENOMEM;
2181	}
2182	error = 0;
2183
2184	/* Somebody else might have raced and expanded it already */
2185	if (address > vma->vm_end) {
2186		unsigned long size, grow;
2187
2188		size = address - vma->vm_start;
2189		grow = (address - vma->vm_end) >> PAGE_SHIFT;
2190
2191		error = -ENOMEM;
2192		if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) {
2193			error = acct_stack_growth(vma, size, grow);
2194			if (!error) {
2195				/*
2196				 * vma_gap_update() doesn't support concurrent
2197				 * updates, but we only hold a shared mmap_sem
2198				 * lock here, so we need to protect against
2199				 * concurrent vma expansions.
2200				 * vma_lock_anon_vma() doesn't help here, as
2201				 * we don't guarantee that all growable vmas
2202				 * in a mm share the same root anon vma.
2203				 * So, we reuse mm->page_table_lock to guard
2204				 * against concurrent vma expansions.
2205				 */
2206				spin_lock(&vma->vm_mm->page_table_lock);
2207				anon_vma_interval_tree_pre_update_vma(vma);
2208				vma->vm_end = address;
2209				anon_vma_interval_tree_post_update_vma(vma);
2210				if (vma->vm_next)
2211					vma_gap_update(vma->vm_next);
2212				else
2213					vma->vm_mm->highest_vm_end = address;
2214				spin_unlock(&vma->vm_mm->page_table_lock);
2215
2216				perf_event_mmap(vma);
2217			}
2218		}
2219	}
2220	vma_unlock_anon_vma(vma);
2221	khugepaged_enter_vma_merge(vma, vma->vm_flags);
2222	validate_mm(vma->vm_mm);
2223	return error;
2224}
2225#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
2226
2227/*
2228 * vma is the first one with address < vma->vm_start.  Have to extend vma.
2229 */
2230int expand_downwards(struct vm_area_struct *vma,
2231				   unsigned long address)
2232{
2233	int error;
2234
2235	/*
2236	 * We must make sure the anon_vma is allocated
2237	 * so that the anon_vma locking is not a noop.
2238	 */
2239	if (unlikely(anon_vma_prepare(vma)))
2240		return -ENOMEM;
2241
2242	address &= PAGE_MASK;
2243	error = security_mmap_addr(address);
2244	if (error)
2245		return error;
2246
2247	vma_lock_anon_vma(vma);
2248
2249	/*
2250	 * vma->vm_start/vm_end cannot change under us because the caller
2251	 * is required to hold the mmap_sem in read mode.  We need the
2252	 * anon_vma lock to serialize against concurrent expand_stacks.
2253	 */
2254
2255	/* Somebody else might have raced and expanded it already */
2256	if (address < vma->vm_start) {
2257		unsigned long size, grow;
2258
2259		size = vma->vm_end - address;
2260		grow = (vma->vm_start - address) >> PAGE_SHIFT;
2261
2262		error = -ENOMEM;
2263		if (grow <= vma->vm_pgoff) {
2264			error = acct_stack_growth(vma, size, grow);
2265			if (!error) {
2266				/*
2267				 * vma_gap_update() doesn't support concurrent
2268				 * updates, but we only hold a shared mmap_sem
2269				 * lock here, so we need to protect against
2270				 * concurrent vma expansions.
2271				 * vma_lock_anon_vma() doesn't help here, as
2272				 * we don't guarantee that all growable vmas
2273				 * in a mm share the same root anon vma.
2274				 * So, we reuse mm->page_table_lock to guard
2275				 * against concurrent vma expansions.
2276				 */
2277				spin_lock(&vma->vm_mm->page_table_lock);
2278				anon_vma_interval_tree_pre_update_vma(vma);
2279				vma->vm_start = address;
2280				vma->vm_pgoff -= grow;
2281				anon_vma_interval_tree_post_update_vma(vma);
2282				vma_gap_update(vma);
2283				spin_unlock(&vma->vm_mm->page_table_lock);
2284
2285				perf_event_mmap(vma);
2286			}
2287		}
2288	}
2289	vma_unlock_anon_vma(vma);
2290	khugepaged_enter_vma_merge(vma, vma->vm_flags);
2291	validate_mm(vma->vm_mm);
2292	return error;
2293}
2294
2295/*
2296 * Note how expand_stack() refuses to expand the stack all the way to
2297 * abut the next virtual mapping, *unless* that mapping itself is also
2298 * a stack mapping. We want to leave room for a guard page, after all
2299 * (the guard page itself is not added here, that is done by the
2300 * actual page faulting logic)
2301 *
2302 * This matches the behavior of the guard page logic (see mm/memory.c:
2303 * check_stack_guard_page()), which only allows the guard page to be
2304 * removed under these circumstances.
2305 */
2306#ifdef CONFIG_STACK_GROWSUP
2307int expand_stack(struct vm_area_struct *vma, unsigned long address)
2308{
2309	struct vm_area_struct *next;
2310
2311	address &= PAGE_MASK;
2312	next = vma->vm_next;
2313	if (next && next->vm_start == address + PAGE_SIZE) {
2314		if (!(next->vm_flags & VM_GROWSUP))
2315			return -ENOMEM;
2316	}
2317	return expand_upwards(vma, address);
2318}
2319
2320struct vm_area_struct *
2321find_extend_vma(struct mm_struct *mm, unsigned long addr)
2322{
2323	struct vm_area_struct *vma, *prev;
2324
2325	addr &= PAGE_MASK;
2326	vma = find_vma_prev(mm, addr, &prev);
2327	if (vma && (vma->vm_start <= addr))
2328		return vma;
2329	if (!prev || expand_stack(prev, addr))
2330		return NULL;
2331	if (prev->vm_flags & VM_LOCKED)
2332		__mlock_vma_pages_range(prev, addr, prev->vm_end, NULL);
2333	return prev;
2334}
2335#else
2336int expand_stack(struct vm_area_struct *vma, unsigned long address)
2337{
2338	struct vm_area_struct *prev;
2339
2340	address &= PAGE_MASK;
2341	prev = vma->vm_prev;
2342	if (prev && prev->vm_end == address) {
2343		if (!(prev->vm_flags & VM_GROWSDOWN))
2344			return -ENOMEM;
2345	}
2346	return expand_downwards(vma, address);
2347}
2348
2349struct vm_area_struct *
2350find_extend_vma(struct mm_struct *mm, unsigned long addr)
2351{
2352	struct vm_area_struct *vma;
2353	unsigned long start;
2354
2355	addr &= PAGE_MASK;
2356	vma = find_vma(mm, addr);
2357	if (!vma)
2358		return NULL;
2359	if (vma->vm_start <= addr)
2360		return vma;
2361	if (!(vma->vm_flags & VM_GROWSDOWN))
2362		return NULL;
2363	start = vma->vm_start;
2364	if (expand_stack(vma, addr))
2365		return NULL;
2366	if (vma->vm_flags & VM_LOCKED)
2367		__mlock_vma_pages_range(vma, addr, start, NULL);
2368	return vma;
2369}
2370#endif
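
/*
 * Sketch of the typical consumer of the two variants above: the
 * architecture page fault handlers grow the stack on demand roughly like
 * this (simplified and illustrative only, not a drop-in handler):
 *
 *	vma = find_vma(mm, address);
 *	if (!vma)
 *		goto bad_area;
 *	if (vma->vm_start <= address)
 *		goto good_area;
 *	if (!(vma->vm_flags & VM_GROWSDOWN))
 *		goto bad_area;
 *	if (expand_stack(vma, address))
 *		goto bad_area;
 * good_area:
 *	... handle_mm_fault(mm, vma, address, fault_flags) ...
 *
 * get_user_pages()-style callers take the shorter find_extend_vma() path
 * instead, which folds the lookup and the expansion together.
 */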
2371
2372/*
2373 * Ok - we have the memory areas we should free on the vma list,
2374 * so release them, and do the vma updates.
2375 *
2376 * Called with the mm semaphore held.
2377 */
2378static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
2379{
2380	unsigned long nr_accounted = 0;
2381
2382	/* Update high watermark before we lower total_vm */
2383	update_hiwater_vm(mm);
2384	do {
2385		long nrpages = vma_pages(vma);
2386
2387		if (vma->vm_flags & VM_ACCOUNT)
2388			nr_accounted += nrpages;
2389		vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages);
2390		vma = remove_vma(vma);
2391	} while (vma);
2392	vm_unacct_memory(nr_accounted);
2393	validate_mm(mm);
2394}
2395
2396/*
2397 * Get rid of page table information in the indicated region.
2398 *
2399 * Called with the mm semaphore held.
2400 */
2401static void unmap_region(struct mm_struct *mm,
2402		struct vm_area_struct *vma, struct vm_area_struct *prev,
2403		unsigned long start, unsigned long end)
2404{
2405	struct vm_area_struct *next = prev ? prev->vm_next : mm->mmap;
2406	struct mmu_gather tlb;
2407
2408	lru_add_drain();
2409	tlb_gather_mmu(&tlb, mm, start, end);
2410	update_hiwater_rss(mm);
2411	unmap_vmas(&tlb, vma, start, end);
2412	free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
2413				 next ? next->vm_start : USER_PGTABLES_CEILING);
2414	tlb_finish_mmu(&tlb, start, end);
2415}
2416
2417/*
2418 * Create a list of vmas touched by the unmap, removing them from the mm's
2419 * vma list as we go.
2420 */
2421static void
2422detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
2423	struct vm_area_struct *prev, unsigned long end)
2424{
2425	struct vm_area_struct **insertion_point;
2426	struct vm_area_struct *tail_vma = NULL;
2427
2428	insertion_point = (prev ? &prev->vm_next : &mm->mmap);
2429	vma->vm_prev = NULL;
2430	do {
2431		vma_rb_erase(vma, &mm->mm_rb);
2432		mm->map_count--;
2433		tail_vma = vma;
2434		vma = vma->vm_next;
2435	} while (vma && vma->vm_start < end);
2436	*insertion_point = vma;
2437	if (vma) {
2438		vma->vm_prev = prev;
2439		vma_gap_update(vma);
2440	} else
2441		mm->highest_vm_end = prev ? prev->vm_end : 0;
2442	tail_vma->vm_next = NULL;
2443
2444	/* Kill the cache */
2445	vmacache_invalidate(mm);
2446}
2447
2448/*
2449 * __split_vma() bypasses sysctl_max_map_count checking.  We use this on the
2450 * munmap path where it doesn't make sense to fail.
2451 */
2452static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
2453	      unsigned long addr, int new_below)
2454{
2455	struct vm_area_struct *new;
2456	int err = -ENOMEM;
2457
2458	if (is_vm_hugetlb_page(vma) && (addr &
2459					~(huge_page_mask(hstate_vma(vma)))))
2460		return -EINVAL;
2461
2462	new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
2463	if (!new)
2464		goto out_err;
2465
2466	/* most fields are the same, copy all, and then fixup */
2467	*new = *vma;
2468
2469	INIT_LIST_HEAD(&new->anon_vma_chain);
2470
2471	if (new_below)
2472		new->vm_end = addr;
2473	else {
2474		new->vm_start = addr;
2475		new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT);
2476	}
2477
2478	err = vma_dup_policy(vma, new);
2479	if (err)
2480		goto out_free_vma;
2481
2482	err = anon_vma_clone(new, vma);
2483	if (err)
2484		goto out_free_mpol;
2485
2486	if (new->vm_file)
2487		get_file(new->vm_file);
2488
2489	if (new->vm_ops && new->vm_ops->open)
2490		new->vm_ops->open(new);
2491
2492	if (new_below)
2493		err = vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff +
2494			((addr - new->vm_start) >> PAGE_SHIFT), new);
2495	else
2496		err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new);
2497
2498	/* Success. */
2499	if (!err)
2500		return 0;
2501
2502	/* Clean everything up if vma_adjust failed. */
2503	if (new->vm_ops && new->vm_ops->close)
2504		new->vm_ops->close(new);
2505	if (new->vm_file)
2506		fput(new->vm_file);
2507	unlink_anon_vmas(new);
2508 out_free_mpol:
2509	mpol_put(vma_policy(new));
2510 out_free_vma:
2511	kmem_cache_free(vm_area_cachep, new);
2512 out_err:
2513	return err;
2514}
2515
2516/*
2517 * Split a vma into two pieces at address 'addr'; a new vma is allocated
2518 * either for the first part or the tail.
2519 */
2520int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
2521	      unsigned long addr, int new_below)
2522{
2523	if (mm->map_count >= sysctl_max_map_count)
2524		return -ENOMEM;
2525
2526	return __split_vma(mm, vma, addr, new_below);
2527}
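
/*
 * Example of the new_below convention (this is how do_munmap() below uses
 * __split_vma() when carving [start, end) out of larger vmas):
 *
 *	__split_vma(mm, vma, start, 0);   new vma covers [start, old vm_end)
 *	__split_vma(mm, last, end, 1);    new vma covers [old vm_start, end)
 *
 * i.e. new_below chooses whether the freshly allocated vma takes the part
 * below the split address (new_below != 0) or the part above it
 * (new_below == 0), while the original vma keeps the other half.
 */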
2528
2529/* Munmap is split into 2 main parts -- this part, which works out what
2530 * needs doing, and the helpers above, which operate on the areas
2531 * themselves and do the actual work.  This now handles partial unmappings.
2532 * Jeremy Fitzhardinge <jeremy@goop.org>
2533 */
2534int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
2535{
2536	unsigned long end;
2537	struct vm_area_struct *vma, *prev, *last;
2538
2539	if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start)
2540		return -EINVAL;
2541
2542	len = PAGE_ALIGN(len);
2543	if (len == 0)
2544		return -EINVAL;
2545
2546	/* Find the first overlapping VMA */
2547	vma = find_vma(mm, start);
2548	if (!vma)
2549		return 0;
2550	prev = vma->vm_prev;
2551	/* we have  start < vma->vm_end  */
2552
2553	/* if it doesn't overlap, we have nothing.. */
2554	end = start + len;
2555	if (vma->vm_start >= end)
2556		return 0;
2557
2558	/*
2559	 * If we need to split any vma, do it now to save pain later.
2560	 *
2561	 * Note: mremap's move_vma VM_ACCOUNT handling assumes a partially
2562	 * unmapped vm_area_struct will remain in use: so lower split_vma
2563	 * places tmp vma above, and higher split_vma places tmp vma below.
2564	 */
2565	if (start > vma->vm_start) {
2566		int error;
2567
2568		/*
2569		 * Make sure that map_count on return from munmap() will
2570		 * not exceed its limit; but let map_count go just above
2571		 * its limit temporarily, to help free resources as expected.
2572		 */
2573		if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count)
2574			return -ENOMEM;
2575
2576		error = __split_vma(mm, vma, start, 0);
2577		if (error)
2578			return error;
2579		prev = vma;
2580	}
2581
2582	/* Does it split the last one? */
2583	last = find_vma(mm, end);
2584	if (last && end > last->vm_start) {
2585		int error = __split_vma(mm, last, end, 1);
2586		if (error)
2587			return error;
2588	}
2589	vma = prev ? prev->vm_next : mm->mmap;
2590
2591	/*
2592	 * unlock any mlock()ed ranges before detaching vmas
2593	 */
2594	if (mm->locked_vm) {
2595		struct vm_area_struct *tmp = vma;
2596		while (tmp && tmp->vm_start < end) {
2597			if (tmp->vm_flags & VM_LOCKED) {
2598				mm->locked_vm -= vma_pages(tmp);
2599				munlock_vma_pages_all(tmp);
2600			}
2601			tmp = tmp->vm_next;
2602		}
2603	}
2604
2605	/*
2606	 * Remove the vma's, and unmap the actual pages
2607	 */
2608	detach_vmas_to_be_unmapped(mm, vma, prev, end);
2609	unmap_region(mm, vma, prev, start, end);
2610
2611	/* Fix up all other VM information */
2612	remove_vma_list(mm, vma);
2613
2614	return 0;
2615}
2616
2617int vm_munmap(unsigned long start, size_t len)
2618{
2619	int ret;
2620	struct mm_struct *mm = current->mm;
2621
2622	down_write(&mm->mmap_sem);
2623	ret = do_munmap(mm, start, len);
2624	up_write(&mm->mmap_sem);
2625	return ret;
2626}
2627EXPORT_SYMBOL(vm_munmap);
2628
2629SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
2630{
2631	profile_munmap(addr);
2632	return vm_munmap(addr, len);
2633}
2634
2635static inline void verify_mm_writelocked(struct mm_struct *mm)
2636{
2637#ifdef CONFIG_DEBUG_VM
2638	if (unlikely(down_read_trylock(&mm->mmap_sem))) {
2639		WARN_ON(1);
2640		up_read(&mm->mmap_sem);
2641	}
2642#endif
2643}
2644
2645/*
2646 *  This is really a simplified "do_mmap".  It only handles
2647 *  anonymous maps.  Eventually we may be able to do some
2648 *  brk-specific accounting here.
2649 */
2650static unsigned long do_brk(unsigned long addr, unsigned long len)
2651{
2652	struct mm_struct *mm = current->mm;
2653	struct vm_area_struct *vma, *prev;
2654	unsigned long flags;
2655	struct rb_node **rb_link, *rb_parent;
2656	pgoff_t pgoff = addr >> PAGE_SHIFT;
2657	int error;
2658
2659	len = PAGE_ALIGN(len);
2660	if (!len)
2661		return addr;
2662
2663	flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
2664
2665	error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
2666	if (error & ~PAGE_MASK)
2667		return error;
2668
2669	error = mlock_future_check(mm, mm->def_flags, len);
2670	if (error)
2671		return error;
2672
2673	/*
2674	 * mm->mmap_sem is required to protect against another thread
2675	 * changing the mappings in case we sleep.
2676	 */
2677	verify_mm_writelocked(mm);
2678
2679	/*
2680	 * Clear old maps.  This also does some error checking for us
2681	 */
2682 munmap_back:
2683	if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) {
2684		if (do_munmap(mm, addr, len))
2685			return -ENOMEM;
2686		goto munmap_back;
2687	}
2688
2689	/* Check against address space limits *after* clearing old maps... */
2690	if (!may_expand_vm(mm, len >> PAGE_SHIFT))
2691		return -ENOMEM;
2692
2693	if (mm->map_count > sysctl_max_map_count)
2694		return -ENOMEM;
2695
2696	if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT))
2697		return -ENOMEM;
2698
2699	/* Can we just expand an old private anonymous mapping? */
2700	vma = vma_merge(mm, prev, addr, addr + len, flags,
2701					NULL, NULL, pgoff, NULL, NULL);
2702	if (vma)
2703		goto out;
2704
2705	/*
2706	 * create a vma struct for an anonymous mapping
2707	 */
2708	vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
2709	if (!vma) {
2710		vm_unacct_memory(len >> PAGE_SHIFT);
2711		return -ENOMEM;
2712	}
2713
2714	INIT_LIST_HEAD(&vma->anon_vma_chain);
2715	vma->vm_mm = mm;
2716	vma->vm_start = addr;
2717	vma->vm_end = addr + len;
2718	vma->vm_pgoff = pgoff;
2719	vma->vm_flags = flags;
2720	vma->vm_page_prot = vm_get_page_prot(flags);
2721	vma_link(mm, vma, prev, rb_link, rb_parent);
2722out:
2723	perf_event_mmap(vma);
2724	mm->total_vm += len >> PAGE_SHIFT;
2725	if (flags & VM_LOCKED)
2726		mm->locked_vm += (len >> PAGE_SHIFT);
2727	vma->vm_flags |= VM_SOFTDIRTY;
2728	return addr;
2729}
2730
2731unsigned long vm_brk(unsigned long addr, unsigned long len)
2732{
2733	struct mm_struct *mm = current->mm;
2734	unsigned long ret;
2735	bool populate;
2736
2737	down_write(&mm->mmap_sem);
2738	ret = do_brk(addr, len);
2739	populate = ((mm->def_flags & VM_LOCKED) != 0);
2740	up_write(&mm->mmap_sem);
2741	if (populate)
2742		mm_populate(addr, len);
2743	return ret;
2744}
2745EXPORT_SYMBOL(vm_brk);
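
/*
 * Sketch of a typical vm_brk() caller (loosely modelled on how a binfmt
 * loader sets up an anonymous, zero-filled region such as a bss segment;
 * simplified and illustrative only):
 *
 *	addr = vm_brk(start, end - start);
 *	if (addr & ~PAGE_MASK)
 *		return addr;	the low bits flag an errno, e.g. -ENOMEM
 *
 * As with do_brk() above, the length is page-aligned internally and the
 * mapping inherits mm->def_flags, so VM_LOCKED processes get the new
 * range populated before vm_brk() returns.
 */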
2746
2747/* Release all mmaps. */
2748void exit_mmap(struct mm_struct *mm)
2749{
2750	struct mmu_gather tlb;
2751	struct vm_area_struct *vma;
2752	unsigned long nr_accounted = 0;
2753
2754	/* mm's last user has gone, and it's about to be pulled down */
2755	mmu_notifier_release(mm);
2756
2757	if (mm->locked_vm) {
2758		vma = mm->mmap;
2759		while (vma) {
2760			if (vma->vm_flags & VM_LOCKED)
2761				munlock_vma_pages_all(vma);
2762			vma = vma->vm_next;
2763		}
2764	}
2765
2766	arch_exit_mmap(mm);
2767
2768	vma = mm->mmap;
2769	if (!vma)	/* Can happen if dup_mmap() received an OOM */
2770		return;
2771
2772	lru_add_drain();
2773	flush_cache_mm(mm);
2774	tlb_gather_mmu(&tlb, mm, 0, -1);
2775	/* update_hiwater_rss(mm) here? but nobody should be looking */
2776	/* Use -1 here to ensure all VMAs in the mm are unmapped */
2777	unmap_vmas(&tlb, vma, 0, -1);
2778
2779	free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
2780	tlb_finish_mmu(&tlb, 0, -1);
2781
2782	/*
2783	 * Walk the list again, actually closing and freeing it,
2784	 * with preemption enabled, without holding any MM locks.
2785	 */
2786	while (vma) {
2787		if (vma->vm_flags & VM_ACCOUNT)
2788			nr_accounted += vma_pages(vma);
2789		vma = remove_vma(vma);
2790	}
2791	vm_unacct_memory(nr_accounted);
2792
2793	WARN_ON(atomic_long_read(&mm->nr_ptes) >
2794			(FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
2795}
2796
2797/* Insert vm structure into process list sorted by address
2798 * and into the inode's i_mmap tree.  If vm_file is non-NULL
2799 * then i_mmap_mutex is taken here.
2800 */
2801int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
2802{
2803	struct vm_area_struct *prev;
2804	struct rb_node **rb_link, *rb_parent;
2805
2806	/*
2807	 * The vm_pgoff of a purely anonymous vma should be irrelevant
2808	 * until its first write fault, when the page's anon_vma and index
2809	 * are set.  But now set the vm_pgoff it will almost certainly
2810	 * end up with (unless mremap moves it elsewhere before that
2811	 * first write fault), so /proc/pid/maps tells a consistent story.
2812	 *
2813	 * By setting it to reflect the virtual start address of the
2814	 * vma, merges and splits can happen in a seamless way, just
2815	 * using the existing file pgoff checks and manipulations.
2816	 * Similarly in do_mmap_pgoff and in do_brk.
2817	 */
2818	if (!vma->vm_file) {
2819		BUG_ON(vma->anon_vma);
2820		vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
2821	}
2822	if (find_vma_links(mm, vma->vm_start, vma->vm_end,
2823			   &prev, &rb_link, &rb_parent))
2824		return -ENOMEM;
2825	if ((vma->vm_flags & VM_ACCOUNT) &&
2826	     security_vm_enough_memory_mm(mm, vma_pages(vma)))
2827		return -ENOMEM;
2828
2829	vma_link(mm, vma, prev, rb_link, rb_parent);
2830	return 0;
2831}
2832
2833/*
2834 * Copy the vma structure to a new location in the same mm,
2835 * prior to moving page table entries, to effect an mremap move.
2836 */
2837struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
2838	unsigned long addr, unsigned long len, pgoff_t pgoff,
2839	bool *need_rmap_locks)
2840{
2841	struct vm_area_struct *vma = *vmap;
2842	unsigned long vma_start = vma->vm_start;
2843	struct mm_struct *mm = vma->vm_mm;
2844	struct vm_area_struct *new_vma, *prev;
2845	struct rb_node **rb_link, *rb_parent;
2846	bool faulted_in_anon_vma = true;
2847
2848	/*
2849	 * If anonymous vma has not yet been faulted, update new pgoff
2850	 * to match new location, to increase its chance of merging.
2851	 */
2852	if (unlikely(!vma->vm_file && !vma->anon_vma)) {
2853		pgoff = addr >> PAGE_SHIFT;
2854		faulted_in_anon_vma = false;
2855	}
2856
2857	if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent))
2858		return NULL;	/* should never get here */
2859	new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
2860			vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
2861			vma_get_anon_name(vma));
2862	if (new_vma) {
2863		/*
2864		 * Source vma may have been merged into new_vma
2865		 */
2866		if (unlikely(vma_start >= new_vma->vm_start &&
2867			     vma_start < new_vma->vm_end)) {
2868			/*
2869			 * The only way we can get a vma_merge with
2870			 * self during an mremap is if the vma hasn't
2871			 * been faulted in yet and we were allowed to
2872			 * reset the dst vma->vm_pgoff to the
2873			 * destination address of the mremap to allow
2874			 * the merge to happen. mremap must change the
2875			 * vm_pgoff linearity between src and dst vmas
2876			 * (in turn preventing a vma_merge) to be
2877			 * safe. It is only safe to keep the vm_pgoff
2878			 * linear if there are no pages mapped yet.
2879			 */
2880			VM_BUG_ON_VMA(faulted_in_anon_vma, new_vma);
2881			*vmap = vma = new_vma;
2882		}
2883		*need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);
2884	} else {
2885		new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
2886		if (new_vma) {
2887			*new_vma = *vma;
2888			new_vma->vm_start = addr;
2889			new_vma->vm_end = addr + len;
2890			new_vma->vm_pgoff = pgoff;
2891			if (vma_dup_policy(vma, new_vma))
2892				goto out_free_vma;
2893			INIT_LIST_HEAD(&new_vma->anon_vma_chain);
2894			if (anon_vma_clone(new_vma, vma))
2895				goto out_free_mempol;
2896			if (new_vma->vm_file)
2897				get_file(new_vma->vm_file);
2898			if (new_vma->vm_ops && new_vma->vm_ops->open)
2899				new_vma->vm_ops->open(new_vma);
2900			vma_link(mm, new_vma, prev, rb_link, rb_parent);
2901			*need_rmap_locks = false;
2902		}
2903	}
2904	return new_vma;
2905
2906 out_free_mempol:
2907	mpol_put(vma_policy(new_vma));
2908 out_free_vma:
2909	kmem_cache_free(vm_area_cachep, new_vma);
2910	return NULL;
2911}
2912
2913/*
2914 * Return true if the calling process may expand its vm space by the passed
2915 * number of pages
2916 */
2917int may_expand_vm(struct mm_struct *mm, unsigned long npages)
2918{
2919	unsigned long cur = mm->total_vm;	/* pages */
2920	unsigned long lim;
2921
2922	lim = rlimit(RLIMIT_AS) >> PAGE_SHIFT;
2923
2924	if (cur + npages > lim)
2925		return 0;
2926	return 1;
2927}
2928
2929static int special_mapping_fault(struct vm_area_struct *vma,
2930				 struct vm_fault *vmf);
2931
2932/*
2933 * Having a close hook prevents vma merging regardless of flags.
2934 */
2935static void special_mapping_close(struct vm_area_struct *vma)
2936{
2937}
2938
2939static const char *special_mapping_name(struct vm_area_struct *vma)
2940{
2941	return ((struct vm_special_mapping *)vma->vm_private_data)->name;
2942}
2943
2944static const struct vm_operations_struct special_mapping_vmops = {
2945	.close = special_mapping_close,
2946	.fault = special_mapping_fault,
2947	.name = special_mapping_name,
2948};
2949
2950static const struct vm_operations_struct legacy_special_mapping_vmops = {
2951	.close = special_mapping_close,
2952	.fault = special_mapping_fault,
2953};
2954
2955static int special_mapping_fault(struct vm_area_struct *vma,
2956				struct vm_fault *vmf)
2957{
2958	pgoff_t pgoff;
2959	struct page **pages;
2960
2961	/*
2962	 * special mappings have no vm_file, and in that case, the mm
2963	 * uses vm_pgoff internally. So we have to subtract it from here.
2964	 * We are allowed to do this because we are the mm; do not copy
2965	 * this code into drivers!
2966	 */
2967	pgoff = vmf->pgoff - vma->vm_pgoff;
2968
2969	if (vma->vm_ops == &legacy_special_mapping_vmops)
2970		pages = vma->vm_private_data;
2971	else
2972		pages = ((struct vm_special_mapping *)vma->vm_private_data)->
2973			pages;
2974
2975	for (; pgoff && *pages; ++pages)
2976		pgoff--;
2977
2978	if (*pages) {
2979		struct page *page = *pages;
2980		get_page(page);
2981		vmf->page = page;
2982		return 0;
2983	}
2984
2985	return VM_FAULT_SIGBUS;
2986}
2987
2988static struct vm_area_struct *__install_special_mapping(
2989	struct mm_struct *mm,
2990	unsigned long addr, unsigned long len,
2991	unsigned long vm_flags, const struct vm_operations_struct *ops,
2992	void *priv)
2993{
2994	int ret;
2995	struct vm_area_struct *vma;
2996
2997	vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
2998	if (unlikely(vma == NULL))
2999		return ERR_PTR(-ENOMEM);
3000
3001	INIT_LIST_HEAD(&vma->anon_vma_chain);
3002	vma->vm_mm = mm;
3003	vma->vm_start = addr;
3004	vma->vm_end = addr + len;
3005
3006	vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND | VM_SOFTDIRTY;
3007	vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
3008
3009	vma->vm_ops = ops;
3010	vma->vm_private_data = priv;
3011
3012	ret = insert_vm_struct(mm, vma);
3013	if (ret)
3014		goto out;
3015
3016	mm->total_vm += len >> PAGE_SHIFT;
3017
3018	perf_event_mmap(vma);
3019
3020	return vma;
3021
3022out:
3023	kmem_cache_free(vm_area_cachep, vma);
3024	return ERR_PTR(ret);
3025}
3026
3027/*
3028 * Called with mm->mmap_sem held for writing.
3029 * Insert a new vma covering the given region, with the given flags.
3030 * Its pages are supplied by the given array of struct page *.
3031 * The array can be shorter than len >> PAGE_SHIFT if it's null-terminated.
3032 * The region past the last page supplied will always produce SIGBUS.
3033 * The array pointer and the pages it points to are assumed to stay alive
3034 * for as long as this mapping might exist.
3035 */
3036struct vm_area_struct *_install_special_mapping(
3037	struct mm_struct *mm,
3038	unsigned long addr, unsigned long len,
3039	unsigned long vm_flags, const struct vm_special_mapping *spec)
3040{
3041	return __install_special_mapping(mm, addr, len, vm_flags,
3042					 &special_mapping_vmops, (void *)spec);
3043}
3044
3045int install_special_mapping(struct mm_struct *mm,
3046			    unsigned long addr, unsigned long len,
3047			    unsigned long vm_flags, struct page **pages)
3048{
3049	struct vm_area_struct *vma = __install_special_mapping(
3050		mm, addr, len, vm_flags, &legacy_special_mapping_vmops,
3051		(void *)pages);
3052
3053	return PTR_ERR_OR_ZERO(vma);
3054}
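
/*
 * Sketch of how architecture code typically uses the interface above to
 * map something like a vDSO (the foo_* names and sizes are hypothetical,
 * illustrative only):
 *
 *	static struct page *foo_vdso_pages[2];
 *	static const struct vm_special_mapping foo_vdso_mapping = {
 *		.name	= "[vdso]",
 *		.pages	= foo_vdso_pages,
 *	};
 *
 *	down_write(&mm->mmap_sem);
 *	vma = _install_special_mapping(mm, addr, 2 * PAGE_SIZE,
 *				       VM_READ | VM_EXEC |
 *				       VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC,
 *				       &foo_vdso_mapping);
 *	up_write(&mm->mmap_sem);
 *	if (IS_ERR(vma))
 *		return PTR_ERR(vma);
 *
 * The .name string is what shows up in /proc/pid/maps via the
 * special_mapping_name() hook above.
 */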
3055
3056static DEFINE_MUTEX(mm_all_locks_mutex);
3057
3058static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
3059{
3060	if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) {
3061		/*
3062		 * The LSB of the rb_root.rb_node pointer can't change from
3063		 * under us because we hold the mm_all_locks_mutex.
3064		 */
3065		down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_sem);
3066		 * We can safely set the LSB of rb_root.rb_node after taking the
3067		 * anon_vma->root->rwsem. If some other vma in this mm shares
3068		 * the same anon_vma we won't take it again.
3069		 *
3070		 * No need for atomic instructions here, the pointer
3071		 * can't change from under us thanks to the
3072		 * anon_vma->root->rwsem.
3073		 * anon_vma->root->rwsem.
3074		 */
3075		if (__test_and_set_bit(0, (unsigned long *)
3076				       &anon_vma->root->rb_root.rb_node))
3077			BUG();
3078	}
3079}
3080
3081static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
3082{
3083	if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
3084		/*
3085		 * AS_MM_ALL_LOCKS can't change from under us because
3086		 * we hold the mm_all_locks_mutex.
3087		 *
3088		 * Operations on ->flags have to be atomic because
3089		 * even if AS_MM_ALL_LOCKS is stable thanks to the
3090		 * mm_all_locks_mutex, there may be other cpus
3091		 * changing other bitflags in parallel to us.
3092		 */
3093		if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
3094			BUG();
3095		mutex_lock_nest_lock(&mapping->i_mmap_mutex, &mm->mmap_sem);
3096	}
3097}
3098
3099/*
3100 * This operation locks against the VM for all pte/vma/mm related
3101 * operations that could ever happen on a certain mm. This includes
3102 * vmtruncate, try_to_unmap, and all page faults.
3103 *
3104 * The caller must take the mmap_sem in write mode before calling
3105 * mm_take_all_locks(). The caller isn't allowed to release the
3106 * mmap_sem until mm_drop_all_locks() returns.
3107 *
3108 * mmap_sem in write mode is required in order to block all operations
3109 * that could modify pagetables and free pages without need of
3110 * altering the vma layout (for example populate_range() with
3111 * nonlinear vmas). It's also needed in write mode to prevent new
3112 * anon_vmas from being associated with existing vmas.
3113 *
3114 * A single task can't take more than one mm_take_all_locks() in a row
3115 * or it would deadlock.
3116 *
3117 * The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in
3118 * mapping->flags are used to avoid taking the same lock twice when more
3119 * than one vma in this mm is backed by the same anon_vma or address_space.
3120 *
3121 * We can take all the locks in random order because the VM code
3122 * taking i_mmap_mutex or anon_vma->rwsem outside the mmap_sem never
3123 * takes more than one of them in a row. Secondly we're protected
3124 * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex.
3125 *
3126 * mm_take_all_locks() and mm_drop_all_locks() are expensive operations
3127 * that may have to take thousands of locks.
3128 *
3129 * mm_take_all_locks() can fail if it's interrupted by signals.
3130 */
3131int mm_take_all_locks(struct mm_struct *mm)
3132{
3133	struct vm_area_struct *vma;
3134	struct anon_vma_chain *avc;
3135
3136	BUG_ON(down_read_trylock(&mm->mmap_sem));
3137
3138	mutex_lock(&mm_all_locks_mutex);
3139
3140	for (vma = mm->mmap; vma; vma = vma->vm_next) {
3141		if (signal_pending(current))
3142			goto out_unlock;
3143		if (vma->vm_file && vma->vm_file->f_mapping)
3144			vm_lock_mapping(mm, vma->vm_file->f_mapping);
3145	}
3146
3147	for (vma = mm->mmap; vma; vma = vma->vm_next) {
3148		if (signal_pending(current))
3149			goto out_unlock;
3150		if (vma->anon_vma)
3151			list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
3152				vm_lock_anon_vma(mm, avc->anon_vma);
3153	}
3154
3155	return 0;
3156
3157out_unlock:
3158	mm_drop_all_locks(mm);
3159	return -EINTR;
3160}
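
/*
 * Caller pattern (sketch; this mirrors how mmu notifier registration uses
 * the interface and follows the rules spelled out in the comment above):
 *
 *	down_write(&mm->mmap_sem);
 *	ret = mm_take_all_locks(mm);
 *	if (!ret) {
 *		... all i_mmap and anon_vma locks of this mm are now
 *		    held; safe to publish the new state ...
 *		mm_drop_all_locks(mm);
 *	}
 *	up_write(&mm->mmap_sem);
 *
 * On failure (-EINTR) mm_take_all_locks() has already dropped whatever it
 * managed to take, so the caller must not call mm_drop_all_locks() in
 * that case.
 */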
3161
3162static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
3163{
3164	if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) {
3165		/*
3166		 * The LSB of the rb_root.rb_node pointer can't change to 0 from
3167		 * under us because we hold the mm_all_locks_mutex.
3168		 *
3169		 * We must however clear the bitflag before unlocking
3170		 * the vma so that users of the anon_vma->rb_root will
3171		 * never see our bitflag.
3172		 *
3173		 * No need for atomic instructions here, the pointer
3174		 * can't change from under us until we release the
3175		 * anon_vma->root->rwsem.
3176		 */
3177		if (!__test_and_clear_bit(0, (unsigned long *)
3178					  &anon_vma->root->rb_root.rb_node))
3179			BUG();
3180		anon_vma_unlock_write(anon_vma);
3181	}
3182}
3183
3184static void vm_unlock_mapping(struct address_space *mapping)
3185{
3186	if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
3187		/*
3188		 * AS_MM_ALL_LOCKS can't change to 0 from under us
3189		 * because we hold the mm_all_locks_mutex.
3190		 */
3191		mutex_unlock(&mapping->i_mmap_mutex);
3192		if (!test_and_clear_bit(AS_MM_ALL_LOCKS,
3193					&mapping->flags))
3194			BUG();
3195	}
3196}
3197
3198/*
3199 * The mmap_sem cannot be released by the caller until
3200 * mm_drop_all_locks() returns.
3201 */
3202void mm_drop_all_locks(struct mm_struct *mm)
3203{
3204	struct vm_area_struct *vma;
3205	struct anon_vma_chain *avc;
3206
3207	BUG_ON(down_read_trylock(&mm->mmap_sem));
3208	BUG_ON(!mutex_is_locked(&mm_all_locks_mutex));
3209
3210	for (vma = mm->mmap; vma; vma = vma->vm_next) {
3211		if (vma->anon_vma)
3212			list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
3213				vm_unlock_anon_vma(avc->anon_vma);
3214		if (vma->vm_file && vma->vm_file->f_mapping)
3215			vm_unlock_mapping(vma->vm_file->f_mapping);
3216	}
3217
3218	mutex_unlock(&mm_all_locks_mutex);
3219}
3220
3221/*
3222 * initialise the percpu counter for VM committed memory accounting
3223 */
3224void __init mmap_init(void)
3225{
3226	int ret;
3227
3228	ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL);
3229	VM_BUG_ON(ret);
3230}
3231
3232/*
3233 * Initialise sysctl_user_reserve_kbytes.
3234 *
3235 * This is intended to keep a single memory-hogging process started by a
3236 * user from consuming so much memory that the user cannot recover (kill
3237 * the hog) in OVERCOMMIT_NEVER mode.
3238 *
3239 * The default value is min(3% of free memory, 128MB)
3240 * 128MB is enough to recover with sshd/login, bash, and top/kill.
3241 */
3242static int init_user_reserve(void)
3243{
3244	unsigned long free_kbytes;
3245
3246	free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
3247
3248	sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17);
3249	return 0;
3250}
3251subsys_initcall(init_user_reserve);
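
/*
 * Worked example of the formula above (illustrative numbers): with 1GiB
 * free, free_kbytes is 1048576 and free_kbytes / 32 is 32768kB (32MB,
 * the "3%" in the comment, since 1/32 is roughly 3.1%), which is below
 * the 1UL << 17 = 131072kB (128MB) cap, so the reserve becomes 32MB.
 * With 8GiB free, free_kbytes / 32 would be 256MB and the reserve is
 * clamped to 128MB instead.
 */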
3252
3253/*
3254 * Initialise sysctl_admin_reserve_kbytes.
3255 *
3256 * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin
3257 * to log in and kill a memory hogging process.
3258 *
3259 * Systems with more than 256MB will reserve 8MB, enough to recover
3260 * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will
3261 * only reserve 3% of free pages by default.
3262 */
3263static int init_admin_reserve(void)
3264{
3265	unsigned long free_kbytes;
3266
3267	free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
3268
3269	sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13);
3270	return 0;
3271}
3272subsys_initcall(init_admin_reserve);
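
/*
 * The 256MB threshold quoted in the comment above falls out of the same
 * arithmetic (concrete numbers for clarity): 256MB free is 262144kB, and
 * 262144 / 32 = 8192kB, which is exactly the 1UL << 13 (8MB) cap.
 * Anything with more free memory than that is clamped to 8MB, while
 * smaller systems reserve 3% (1/32) of free memory.
 */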
3273
3274/*
3275 * Reinitialise user and admin reserves if memory is added or removed.
3276 *
3277 * The default user reserve max is 128MB, and the default max for the
3278 * admin reserve is 8MB. These are usually, but not always, enough to
3279 * enable recovery from a memory hogging process using login/sshd, a shell,
3280 * and tools like top. It may make sense to increase or even disable the
3281 * reserve depending on the existence of swap or variations in the recovery
3282 * tools. So, the admin may have changed them.
3283 *
3284 * If memory is added and the reserves have been eliminated or increased above
3285 * the default max, then we'll trust the admin.
3286 *
3287 * If memory is removed and there isn't enough free memory, then we
3288 * need to reset the reserves.
3289 *
3290 * Otherwise keep the reserve set by the admin.
3291 */
3292static int reserve_mem_notifier(struct notifier_block *nb,
3293			     unsigned long action, void *data)
3294{
3295	unsigned long tmp, free_kbytes;
3296
3297	switch (action) {
3298	case MEM_ONLINE:
3299		/* Default max is 128MB. Leave alone if modified by operator. */
3300		tmp = sysctl_user_reserve_kbytes;
3301		if (0 < tmp && tmp < (1UL << 17))
3302			init_user_reserve();
3303
3304		/* Default max is 8MB.  Leave alone if modified by operator. */
3305		tmp = sysctl_admin_reserve_kbytes;
3306		if (0 < tmp && tmp < (1UL << 13))
3307			init_admin_reserve();
3308
3309		break;
3310	case MEM_OFFLINE:
3311		free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
3312
3313		if (sysctl_user_reserve_kbytes > free_kbytes) {
3314			init_user_reserve();
3315			pr_info("vm.user_reserve_kbytes reset to %lu\n",
3316				sysctl_user_reserve_kbytes);
3317		}
3318
3319		if (sysctl_admin_reserve_kbytes > free_kbytes) {
3320			init_admin_reserve();
3321			pr_info("vm.admin_reserve_kbytes reset to %lu\n",
3322				sysctl_admin_reserve_kbytes);
3323		}
3324		break;
3325	default:
3326		break;
3327	}
3328	return NOTIFY_OK;
3329}
3330
3331static struct notifier_block reserve_mem_nb = {
3332	.notifier_call = reserve_mem_notifier,
3333};
3334
3335static int __meminit init_reserve_notifier(void)
3336{
3337	if (register_hotmemory_notifier(&reserve_mem_nb))
3338		pr_err("Failed registering memory add/remove notifier for admin reserve\n");
3339
3340	return 0;
3341}
3342subsys_initcall(init_reserve_notifier);
3343