mempolicy.c revision d98f6cb67fb5b9376d4957d7ba9f32eac35c2e08
1/*
2 * Simple NUMA memory policy for the Linux kernel.
3 *
4 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
6 * Subject to the GNU Public License, version 2.
7 *
8 * NUMA policy allows the user to give hints about the node(s) from which
9 * memory should be allocated.
10 *
11 * Support four policies per VMA and per process:
12 *
13 * The VMA policy has priority over the process policy for a page fault.
14 *
15 * interleave     Allocate memory interleaved over a set of nodes,
16 *                with normal fallback if it fails.
17 *                For VMA based allocations this interleaves based on the
18 *                offset into the backing object or offset into the mapping
19 *                for anonymous memory. For process policy an process counter
20 *                for anonymous memory. For process policy a process counter
21 *
22 * bind           Only allocate memory on a specific set of nodes,
23 *                no fallback.
24 *                FIXME: memory is allocated starting with the first node
25 *                to the last. It would be better if bind truly restricted
26 *                the allocation to the specified memory nodes instead.
27 *
28 * preferred       Try a specific node first before normal fallback.
29 *                As a special case node -1 here means do the allocation
30 *                on the local CPU. This is normally identical to default,
31 *                but useful to set in a VMA when you have a non default
32 *                process policy.
33 *
34 * default        Allocate on the local node first, or when on a VMA
35 *                use the process policy. This is what Linux always did
36 *		  in a NUMA aware kernel and still does by, ahem, default.
37 *
38 * The process policy is applied for most non-interrupt memory allocations
39 * in that process' context. Interrupts ignore the policies and always
40 * try to allocate on the local CPU. The VMA policy is only applied for memory
41 * allocations for a VMA in the VM.
42 *
43 * Currently there are a few corner cases in swapping where the policy
44 * is not applied, but the majority should be handled. When process policy
45 * is used it is not remembered over swap outs/swap ins.
46 *
47 * Only the highest zone in the zone hierarchy gets policied. Allocations
48 * requesting a lower zone just use default policy. This implies that
49 * on systems with highmem, kernel lowmem allocations don't get policied.
50 * Same with GFP_DMA allocations.
51 *
52 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
53 * all users and remembered even when nobody has memory mapped.
54 */
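/*
 * A rough user-space sketch of the interface implemented below (illustrative
 * only; assumes a machine with at least two nodes and libnuma's <numaif.h>
 * for the prototypes and MPOL_* constants):
 *
 *	unsigned long mask = 0x3;		(nodes 0 and 1)
 *	set_mempolicy(MPOL_INTERLEAVE, &mask, sizeof(mask) * 8);
 *
 *	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	unsigned long node0 = 0x1;		(node 0 only)
 *	mbind(p, len, MPOL_BIND, &node0, sizeof(node0) * 8, MPOL_MF_STRICT);
 */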
55
56/* Notebook:
57   fix mmap readahead to honour policy and enable policy for any page cache
58   object
59   statistics for bigpages
60   global policy for page cache? currently it uses process policy. Requires
61   first item above.
62   handle mremap for shared memory (currently ignored for the policy)
63   grows down?
64   make bind policy root only? It can trigger oom much faster and the
65   kernel does not always handle that gracefully.
66*/
67
68#include <linux/mempolicy.h>
69#include <linux/mm.h>
70#include <linux/highmem.h>
71#include <linux/hugetlb.h>
72#include <linux/kernel.h>
73#include <linux/sched.h>
74#include <linux/nodemask.h>
75#include <linux/cpuset.h>
76#include <linux/slab.h>
77#include <linux/string.h>
78#include <linux/module.h>
79#include <linux/nsproxy.h>
80#include <linux/interrupt.h>
81#include <linux/init.h>
82#include <linux/compat.h>
83#include <linux/swap.h>
84#include <linux/seq_file.h>
85#include <linux/proc_fs.h>
86#include <linux/migrate.h>
87#include <linux/ksm.h>
88#include <linux/rmap.h>
89#include <linux/security.h>
90#include <linux/syscalls.h>
91#include <linux/ctype.h>
92#include <linux/mm_inline.h>
93
94#include <asm/tlbflush.h>
95#include <asm/uaccess.h>
96
97#include "internal.h"
98
99/* Internal flags */
100#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip the VMA contiguity checks */
101#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
102#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)		/* Gather statistics */
103
104static struct kmem_cache *policy_cache;
105static struct kmem_cache *sn_cache;
106
107/* Highest zone. A specific allocation for a zone below that is not
108   policied. */
109enum zone_type policy_zone = 0;
110
111/*
112 * run-time system-wide default policy => local allocation
113 */
114struct mempolicy default_policy = {
115	.refcnt = ATOMIC_INIT(1), /* never free it */
116	.mode = MPOL_PREFERRED,
117	.flags = MPOL_F_LOCAL,
118};
119
120static const struct mempolicy_operations {
121	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
122	/*
123	 * If the read-side task has no lock protecting task->mempolicy, the
124	 * write-side task rebinds task->mempolicy in two steps. The first step
125	 * sets all the newly allowed nodes, and the second step clears all the
126	 * disallowed nodes. This way an allocation never sees an empty
127	 * nodemask.
128	 * If the read-side task does hold such a lock, we rebind
129	 * directly.
130	 *
131	 * step:
132	 * 	MPOL_REBIND_ONCE - do rebind work at once
133	 * 	MPOL_REBIND_STEP1 - set all the newly allowed nodes
134	 * 	MPOL_REBIND_STEP2 - clear all the disallowed nodes
135	 */
136	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
137			enum mpol_rebind_step step);
138} mpol_ops[MPOL_MAX];
139
140/* Check that the nodemask contains at least one populated zone */
141static int is_valid_nodemask(const nodemask_t *nodemask)
142{
143	int nd, k;
144
145	for_each_node_mask(nd, *nodemask) {
146		struct zone *z;
147
148		for (k = 0; k <= policy_zone; k++) {
149			z = &NODE_DATA(nd)->node_zones[k];
150			if (z->present_pages > 0)
151				return 1;
152		}
153	}
154
155	return 0;
156}
157
158static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
159{
160	return pol->flags & MPOL_MODE_FLAGS;
161}
162
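/*
 * Example of what the fold/onto combination below computes: with
 * orig = {0,2} and rel = {4,6,8}, nodes_fold() folds orig modulo
 * nodes_weight(rel) == 3, giving {0,2}, and nodes_onto() then maps bit n
 * to the n-th set bit of rel, giving {4,8}.
 */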
163static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
164				   const nodemask_t *rel)
165{
166	nodemask_t tmp;
167	nodes_fold(tmp, *orig, nodes_weight(*rel));
168	nodes_onto(*ret, tmp, *rel);
169}
170
171static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
172{
173	if (nodes_empty(*nodes))
174		return -EINVAL;
175	pol->v.nodes = *nodes;
176	return 0;
177}
178
179static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
180{
181	if (!nodes)
182		pol->flags |= MPOL_F_LOCAL;	/* local allocation */
183	else if (nodes_empty(*nodes))
184		return -EINVAL;			/*  no allowed nodes */
185	else
186		pol->v.preferred_node = first_node(*nodes);
187	return 0;
188}
189
190static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
191{
192	if (!is_valid_nodemask(nodes))
193		return -EINVAL;
194	pol->v.nodes = *nodes;
195	return 0;
196}
197
198/*
199 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
200 * any, for the new policy.  mpol_new() has already validated the nodes
201 * parameter with respect to the policy mode and flags.  But, we need to
202 * handle an empty nodemask with MPOL_PREFERRED here.
203 *
204 * Must be called holding task's alloc_lock to protect task's mems_allowed
205 * and mempolicy.  May also be called holding the mmap_semaphore for write.
206 */
207static int mpol_set_nodemask(struct mempolicy *pol,
208		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
209{
210	int ret;
211
212	/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
213	if (pol == NULL)
214		return 0;
215	/* Check N_HIGH_MEMORY */
216	nodes_and(nsc->mask1,
217		  cpuset_current_mems_allowed, node_states[N_HIGH_MEMORY]);
218
219	VM_BUG_ON(!nodes);
220	if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
221		nodes = NULL;	/* explicit local allocation */
222	else {
223		if (pol->flags & MPOL_F_RELATIVE_NODES)
224			mpol_relative_nodemask(&nsc->mask2, nodes,&nsc->mask1);
225		else
226			nodes_and(nsc->mask2, *nodes, nsc->mask1);
227
228		if (mpol_store_user_nodemask(pol))
229			pol->w.user_nodemask = *nodes;
230		else
231			pol->w.cpuset_mems_allowed =
232						cpuset_current_mems_allowed;
233	}
234
235	if (nodes)
236		ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
237	else
238		ret = mpol_ops[pol->mode].create(pol, NULL);
239	return ret;
240}
241
242/*
243 * This function just creates a new policy, does some checks and simple
244 * initialization. You must invoke mpol_set_nodemask() to set nodes.
245 */
246static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
247				  nodemask_t *nodes)
248{
249	struct mempolicy *policy;
250
251	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
252		 mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
253
254	if (mode == MPOL_DEFAULT) {
255		if (nodes && !nodes_empty(*nodes))
256			return ERR_PTR(-EINVAL);
257		return NULL;	/* simply delete any existing policy */
258	}
259	VM_BUG_ON(!nodes);
260
261	/*
262	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
263	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
264	 * All other modes require a valid pointer to a non-empty nodemask.
265	 */
266	if (mode == MPOL_PREFERRED) {
267		if (nodes_empty(*nodes)) {
268			if (((flags & MPOL_F_STATIC_NODES) ||
269			     (flags & MPOL_F_RELATIVE_NODES)))
270				return ERR_PTR(-EINVAL);
271		}
272	} else if (nodes_empty(*nodes))
273		return ERR_PTR(-EINVAL);
274	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
275	if (!policy)
276		return ERR_PTR(-ENOMEM);
277	atomic_set(&policy->refcnt, 1);
278	policy->mode = mode;
279	policy->flags = flags;
280
281	return policy;
282}
283
284/* Slow path of a mpol destructor. */
285void __mpol_put(struct mempolicy *p)
286{
287	if (!atomic_dec_and_test(&p->refcnt))
288		return;
289	kmem_cache_free(policy_cache, p);
290}
291
292static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes,
293				enum mpol_rebind_step step)
294{
295}
296
297/*
298 * step:
299 * 	MPOL_REBIND_ONCE  - do rebind work at once
300 * 	MPOL_REBIND_STEP1 - set all the newly allowed nodes
301 * 	MPOL_REBIND_STEP2 - clear all the disallowed nodes
302 */
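/*
 * Worked example for the MPOL_REBIND_ONCE case: an interleave policy
 * created over nodes {0,1} while the cpuset allowed {0,1,2,3}, rebound to
 * new mems {1,2,3}:
 *	no mode flag:            nodes remapped by position     -> {1,2}
 *	MPOL_F_STATIC_NODES:     user_nodemask & new mems       -> {1}
 *	MPOL_F_RELATIVE_NODES:   {0,1} folded onto the new mems -> {1,2}
 */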
303static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
304				 enum mpol_rebind_step step)
305{
306	nodemask_t tmp;
307
308	if (pol->flags & MPOL_F_STATIC_NODES)
309		nodes_and(tmp, pol->w.user_nodemask, *nodes);
310	else if (pol->flags & MPOL_F_RELATIVE_NODES)
311		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
312	else {
313		/*
314		 * if step == 1, we use ->w.cpuset_mems_allowed to cache the
315		 * result
316		 */
317		if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) {
318			nodes_remap(tmp, pol->v.nodes,
319					pol->w.cpuset_mems_allowed, *nodes);
320			pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
321		} else if (step == MPOL_REBIND_STEP2) {
322			tmp = pol->w.cpuset_mems_allowed;
323			pol->w.cpuset_mems_allowed = *nodes;
324		} else
325			BUG();
326	}
327
328	if (nodes_empty(tmp))
329		tmp = *nodes;
330
331	if (step == MPOL_REBIND_STEP1)
332		nodes_or(pol->v.nodes, pol->v.nodes, tmp);
333	else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2)
334		pol->v.nodes = tmp;
335	else
336		BUG();
337
338	if (!node_isset(current->il_next, tmp)) {
339		current->il_next = next_node(current->il_next, tmp);
340		if (current->il_next >= MAX_NUMNODES)
341			current->il_next = first_node(tmp);
342		if (current->il_next >= MAX_NUMNODES)
343			current->il_next = numa_node_id();
344	}
345}
346
347static void mpol_rebind_preferred(struct mempolicy *pol,
348				  const nodemask_t *nodes,
349				  enum mpol_rebind_step step)
350{
351	nodemask_t tmp;
352
353	if (pol->flags & MPOL_F_STATIC_NODES) {
354		int node = first_node(pol->w.user_nodemask);
355
356		if (node_isset(node, *nodes)) {
357			pol->v.preferred_node = node;
358			pol->flags &= ~MPOL_F_LOCAL;
359		} else
360			pol->flags |= MPOL_F_LOCAL;
361	} else if (pol->flags & MPOL_F_RELATIVE_NODES) {
362		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
363		pol->v.preferred_node = first_node(tmp);
364	} else if (!(pol->flags & MPOL_F_LOCAL)) {
365		pol->v.preferred_node = node_remap(pol->v.preferred_node,
366						   pol->w.cpuset_mems_allowed,
367						   *nodes);
368		pol->w.cpuset_mems_allowed = *nodes;
369	}
370}
371
372/*
373 * mpol_rebind_policy - Migrate a policy to a different set of nodes
374 *
375 * If the read-side task has no lock protecting task->mempolicy, the
376 * write-side task rebinds task->mempolicy in two steps. The first step
377 * sets all the newly allowed nodes, and the second step clears all the
378 * disallowed nodes. This way an allocation never sees an empty
379 * nodemask.
380 * If the read-side task does hold such a lock, we rebind
381 * directly.
382 *
383 * step:
384 * 	MPOL_REBIND_ONCE  - do rebind work at once
385 * 	MPOL_REBIND_STEP1 - set all the newly allowed nodes
386 * 	MPOL_REBIND_STEP2 - clear all the disallowed nodes
387 */
388static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
389				enum mpol_rebind_step step)
390{
391	if (!pol)
392		return;
393	if (!mpol_store_user_nodemask(pol) && step == 0 &&
394	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
395		return;
396
397	if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
398		return;
399
400	if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
401		BUG();
402
403	if (step == MPOL_REBIND_STEP1)
404		pol->flags |= MPOL_F_REBINDING;
405	else if (step == MPOL_REBIND_STEP2)
406		pol->flags &= ~MPOL_F_REBINDING;
407	else if (step >= MPOL_REBIND_NSTEP)
408		BUG();
409
410	mpol_ops[pol->mode].rebind(pol, newmask, step);
411}
412
413/*
414 * Wrapper for mpol_rebind_policy() that just requires task
415 * pointer, and updates task mempolicy.
416 *
417 * Called with task's alloc_lock held.
418 */
419
420void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
421			enum mpol_rebind_step step)
422{
423	mpol_rebind_policy(tsk->mempolicy, new, step);
424}
425
426/*
427 * Rebind each vma in mm to new nodemask.
428 *
429 * Call holding a reference to mm.  Takes mm->mmap_sem during call.
430 */
431
432void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
433{
434	struct vm_area_struct *vma;
435
436	down_write(&mm->mmap_sem);
437	for (vma = mm->mmap; vma; vma = vma->vm_next)
438		mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
439	up_write(&mm->mmap_sem);
440}
441
442static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
443	[MPOL_DEFAULT] = {
444		.rebind = mpol_rebind_default,
445	},
446	[MPOL_INTERLEAVE] = {
447		.create = mpol_new_interleave,
448		.rebind = mpol_rebind_nodemask,
449	},
450	[MPOL_PREFERRED] = {
451		.create = mpol_new_preferred,
452		.rebind = mpol_rebind_preferred,
453	},
454	[MPOL_BIND] = {
455		.create = mpol_new_bind,
456		.rebind = mpol_rebind_nodemask,
457	},
458};
459
460static void gather_stats(struct page *, void *, int pte_dirty);
461static void migrate_page_add(struct page *page, struct list_head *pagelist,
462				unsigned long flags);
463
464/* Scan through pages, checking whether they meet the given conditions. */
465static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
466		unsigned long addr, unsigned long end,
467		const nodemask_t *nodes, unsigned long flags,
468		void *private)
469{
470	pte_t *orig_pte;
471	pte_t *pte;
472	spinlock_t *ptl;
473
474	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
475	do {
476		struct page *page;
477		int nid;
478
479		if (!pte_present(*pte))
480			continue;
481		page = vm_normal_page(vma, addr, *pte);
482		if (!page)
483			continue;
484		/*
485		 * vm_normal_page() filters out zero pages, but there might
486		 * still be PageReserved pages to skip, perhaps in a VDSO.
487		 * And we cannot move PageKsm pages sensibly or safely yet.
488		 */
489		if (PageReserved(page) || PageKsm(page))
490			continue;
491		nid = page_to_nid(page);
492		if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
493			continue;
494
495		if (flags & MPOL_MF_STATS)
496			gather_stats(page, private, pte_dirty(*pte));
497		else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
498			migrate_page_add(page, private, flags);
499		else
500			break;
501	} while (pte++, addr += PAGE_SIZE, addr != end);
502	pte_unmap_unlock(orig_pte, ptl);
503	return addr != end;
504}
505
506static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
507		unsigned long addr, unsigned long end,
508		const nodemask_t *nodes, unsigned long flags,
509		void *private)
510{
511	pmd_t *pmd;
512	unsigned long next;
513
514	pmd = pmd_offset(pud, addr);
515	do {
516		next = pmd_addr_end(addr, end);
517		split_huge_page_pmd(vma->vm_mm, pmd);
518		if (pmd_none_or_clear_bad(pmd))
519			continue;
520		if (check_pte_range(vma, pmd, addr, next, nodes,
521				    flags, private))
522			return -EIO;
523	} while (pmd++, addr = next, addr != end);
524	return 0;
525}
526
527static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
528		unsigned long addr, unsigned long end,
529		const nodemask_t *nodes, unsigned long flags,
530		void *private)
531{
532	pud_t *pud;
533	unsigned long next;
534
535	pud = pud_offset(pgd, addr);
536	do {
537		next = pud_addr_end(addr, end);
538		if (pud_none_or_clear_bad(pud))
539			continue;
540		if (check_pmd_range(vma, pud, addr, next, nodes,
541				    flags, private))
542			return -EIO;
543	} while (pud++, addr = next, addr != end);
544	return 0;
545}
546
547static inline int check_pgd_range(struct vm_area_struct *vma,
548		unsigned long addr, unsigned long end,
549		const nodemask_t *nodes, unsigned long flags,
550		void *private)
551{
552	pgd_t *pgd;
553	unsigned long next;
554
555	pgd = pgd_offset(vma->vm_mm, addr);
556	do {
557		next = pgd_addr_end(addr, end);
558		if (pgd_none_or_clear_bad(pgd))
559			continue;
560		if (check_pud_range(vma, pgd, addr, next, nodes,
561				    flags, private))
562			return -EIO;
563	} while (pgd++, addr = next, addr != end);
564	return 0;
565}
566
567/*
568 * Check if all pages in a range are on a set of nodes.
569 * If pagelist != NULL then isolate pages from the LRU and
570 * put them on the pagelist.
571 */
572static struct vm_area_struct *
573check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
574		const nodemask_t *nodes, unsigned long flags, void *private)
575{
576	int err;
577	struct vm_area_struct *first, *vma, *prev;
578
579
580	first = find_vma(mm, start);
581	if (!first)
582		return ERR_PTR(-EFAULT);
583	prev = NULL;
584	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
585		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
586			if (!vma->vm_next && vma->vm_end < end)
587				return ERR_PTR(-EFAULT);
588			if (prev && prev->vm_end < vma->vm_start)
589				return ERR_PTR(-EFAULT);
590		}
591		if (!is_vm_hugetlb_page(vma) &&
592		    ((flags & MPOL_MF_STRICT) ||
593		     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
594				vma_migratable(vma)))) {
595			unsigned long endvma = vma->vm_end;
596
597			if (endvma > end)
598				endvma = end;
599			if (vma->vm_start > start)
600				start = vma->vm_start;
601			err = check_pgd_range(vma, start, endvma, nodes,
602						flags, private);
603			if (err) {
604				first = ERR_PTR(err);
605				break;
606			}
607		}
608		prev = vma;
609	}
610	return first;
611}
612
613/* Apply policy to a single VMA */
614static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
615{
616	int err = 0;
617	struct mempolicy *old = vma->vm_policy;
618
619	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
620		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
621		 vma->vm_ops, vma->vm_file,
622		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
623
624	if (vma->vm_ops && vma->vm_ops->set_policy)
625		err = vma->vm_ops->set_policy(vma, new);
626	if (!err) {
627		mpol_get(new);
628		vma->vm_policy = new;
629		mpol_put(old);
630	}
631	return err;
632}
633
634/* Step 2: apply policy to a range and do splits. */
635static int mbind_range(struct mm_struct *mm, unsigned long start,
636		       unsigned long end, struct mempolicy *new_pol)
637{
638	struct vm_area_struct *next;
639	struct vm_area_struct *prev;
640	struct vm_area_struct *vma;
641	int err = 0;
642	pgoff_t pgoff;
643	unsigned long vmstart;
644	unsigned long vmend;
645
646	vma = find_vma_prev(mm, start, &prev);
647	if (!vma || vma->vm_start > start)
648		return -EFAULT;
649
650	for (; vma && vma->vm_start < end; prev = vma, vma = next) {
651		next = vma->vm_next;
652		vmstart = max(start, vma->vm_start);
653		vmend   = min(end, vma->vm_end);
654
655		pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
656		prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
657				  vma->anon_vma, vma->vm_file, pgoff, new_pol);
658		if (prev) {
659			vma = prev;
660			next = vma->vm_next;
661			continue;
662		}
663		if (vma->vm_start != vmstart) {
664			err = split_vma(vma->vm_mm, vma, vmstart, 1);
665			if (err)
666				goto out;
667		}
668		if (vma->vm_end != vmend) {
669			err = split_vma(vma->vm_mm, vma, vmend, 0);
670			if (err)
671				goto out;
672		}
673		err = policy_vma(vma, new_pol);
674		if (err)
675			goto out;
676	}
677
678 out:
679	return err;
680}
681
682/*
683 * Update task->flags PF_MEMPOLICY bit: set iff non-default
684 * mempolicy.  Allows more rapid checking of this (combined perhaps
685 * with other PF_* flag bits) on memory allocation hot code paths.
686 *
687 * If called from outside this file, the task 'p' should -only- be
688 * a newly forked child not yet visible on the task list, because
689 * manipulating the task flags of a visible task is not safe.
690 *
691 * The above limitation is why this routine has the funny name
692 * mpol_fix_fork_child_flag().
693 *
694 * It is also safe to call this with a task pointer of current,
695 * which the static wrapper mpol_set_task_struct_flag() does,
696 * for use within this file.
697 */
698
699void mpol_fix_fork_child_flag(struct task_struct *p)
700{
701	if (p->mempolicy)
702		p->flags |= PF_MEMPOLICY;
703	else
704		p->flags &= ~PF_MEMPOLICY;
705}
706
707static void mpol_set_task_struct_flag(void)
708{
709	mpol_fix_fork_child_flag(current);
710}
711
712/* Set the process memory policy */
713static long do_set_mempolicy(unsigned short mode, unsigned short flags,
714			     nodemask_t *nodes)
715{
716	struct mempolicy *new, *old;
717	struct mm_struct *mm = current->mm;
718	NODEMASK_SCRATCH(scratch);
719	int ret;
720
721	if (!scratch)
722		return -ENOMEM;
723
724	new = mpol_new(mode, flags, nodes);
725	if (IS_ERR(new)) {
726		ret = PTR_ERR(new);
727		goto out;
728	}
729	/*
730	 * prevent changing our mempolicy while show_numa_maps()
731	 * is using it.
732	 * Note:  do_set_mempolicy() can be called at init time
733	 * with no 'mm'.
734	 */
735	if (mm)
736		down_write(&mm->mmap_sem);
737	task_lock(current);
738	ret = mpol_set_nodemask(new, nodes, scratch);
739	if (ret) {
740		task_unlock(current);
741		if (mm)
742			up_write(&mm->mmap_sem);
743		mpol_put(new);
744		goto out;
745	}
746	old = current->mempolicy;
747	current->mempolicy = new;
748	mpol_set_task_struct_flag();
749	if (new && new->mode == MPOL_INTERLEAVE &&
750	    nodes_weight(new->v.nodes))
751		current->il_next = first_node(new->v.nodes);
752	task_unlock(current);
753	if (mm)
754		up_write(&mm->mmap_sem);
755
756	mpol_put(old);
757	ret = 0;
758out:
759	NODEMASK_SCRATCH_FREE(scratch);
760	return ret;
761}
762
763/*
764 * Return nodemask for policy for get_mempolicy() query
765 *
766 * Called with task's alloc_lock held
767 */
768static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
769{
770	nodes_clear(*nodes);
771	if (p == &default_policy)
772		return;
773
774	switch (p->mode) {
775	case MPOL_BIND:
776		/* Fall through */
777	case MPOL_INTERLEAVE:
778		*nodes = p->v.nodes;
779		break;
780	case MPOL_PREFERRED:
781		if (!(p->flags & MPOL_F_LOCAL))
782			node_set(p->v.preferred_node, *nodes);
783		/* else return empty node mask for local allocation */
784		break;
785	default:
786		BUG();
787	}
788}
789
790static int lookup_node(struct mm_struct *mm, unsigned long addr)
791{
792	struct page *p;
793	int err;
794
795	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
796	if (err >= 0) {
797		err = page_to_nid(p);
798		put_page(p);
799	}
800	return err;
801}
802
803/* Retrieve NUMA policy */
804static long do_get_mempolicy(int *policy, nodemask_t *nmask,
805			     unsigned long addr, unsigned long flags)
806{
807	int err;
808	struct mm_struct *mm = current->mm;
809	struct vm_area_struct *vma = NULL;
810	struct mempolicy *pol = current->mempolicy;
811
812	if (flags &
813		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
814		return -EINVAL;
815
816	if (flags & MPOL_F_MEMS_ALLOWED) {
817		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
818			return -EINVAL;
819		*policy = 0;	/* just so it's initialized */
820		task_lock(current);
821		*nmask  = cpuset_current_mems_allowed;
822		task_unlock(current);
823		return 0;
824	}
825
826	if (flags & MPOL_F_ADDR) {
827		/*
828		 * Do NOT fall back to task policy if the
829		 * vma/shared policy at addr is NULL.  We
830		 * want to return MPOL_DEFAULT in this case.
831		 */
832		down_read(&mm->mmap_sem);
833		vma = find_vma_intersection(mm, addr, addr+1);
834		if (!vma) {
835			up_read(&mm->mmap_sem);
836			return -EFAULT;
837		}
838		if (vma->vm_ops && vma->vm_ops->get_policy)
839			pol = vma->vm_ops->get_policy(vma, addr);
840		else
841			pol = vma->vm_policy;
842	} else if (addr)
843		return -EINVAL;
844
845	if (!pol)
846		pol = &default_policy;	/* indicates default behavior */
847
848	if (flags & MPOL_F_NODE) {
849		if (flags & MPOL_F_ADDR) {
850			err = lookup_node(mm, addr);
851			if (err < 0)
852				goto out;
853			*policy = err;
854		} else if (pol == current->mempolicy &&
855				pol->mode == MPOL_INTERLEAVE) {
856			*policy = current->il_next;
857		} else {
858			err = -EINVAL;
859			goto out;
860		}
861	} else {
862		*policy = pol == &default_policy ? MPOL_DEFAULT :
863						pol->mode;
864		/*
865		 * Internal mempolicy flags must be masked off before exposing
866		 * the policy to userspace.
867		 */
868		*policy |= (pol->flags & MPOL_MODE_FLAGS);
869	}
870
871	if (vma) {
872		up_read(&current->mm->mmap_sem);
873		vma = NULL;
874	}
875
876	err = 0;
877	if (nmask) {
878		if (mpol_store_user_nodemask(pol)) {
879			*nmask = pol->w.user_nodemask;
880		} else {
881			task_lock(current);
882			get_policy_nodemask(pol, nmask);
883			task_unlock(current);
884		}
885	}
886
887 out:
888	mpol_cond_put(pol);
889	if (vma)
890		up_read(&current->mm->mmap_sem);
891	return err;
892}
893
894#ifdef CONFIG_MIGRATION
895/*
896 * page migration
897 */
898static void migrate_page_add(struct page *page, struct list_head *pagelist,
899				unsigned long flags)
900{
901	/*
902	 * Avoid migrating a page that is shared with others.
903	 */
904	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
905		if (!isolate_lru_page(page)) {
906			list_add_tail(&page->lru, pagelist);
907			inc_zone_page_state(page, NR_ISOLATED_ANON +
908					    page_is_file_cache(page));
909		}
910	}
911}
912
913static struct page *new_node_page(struct page *page, unsigned long node, int **x)
914{
915	return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
916}
917
918/*
919 * Migrate pages from one node to a target node.
920 * Returns error or the number of pages not migrated.
921 */
922static int migrate_to_node(struct mm_struct *mm, int source, int dest,
923			   int flags)
924{
925	nodemask_t nmask;
926	LIST_HEAD(pagelist);
927	int err = 0;
928	struct vm_area_struct *vma;
929
930	nodes_clear(nmask);
931	node_set(source, nmask);
932
933	vma = check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
934			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
935	if (IS_ERR(vma))
936		return PTR_ERR(vma);
937
938	if (!list_empty(&pagelist)) {
939		err = migrate_pages(&pagelist, new_node_page, dest,
940								false, true);
941		if (err)
942			putback_lru_pages(&pagelist);
943	}
944
945	return err;
946}
947
948/*
949 * Move pages between the two nodesets so as to preserve the physical
950 * layout as much as possible.
951 *
952 * Returns the number of pages that could not be moved.
953 */
954int do_migrate_pages(struct mm_struct *mm,
955	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
956{
957	int busy = 0;
958	int err;
959	nodemask_t tmp;
960
961	err = migrate_prep();
962	if (err)
963		return err;
964
965	down_read(&mm->mmap_sem);
966
967	err = migrate_vmas(mm, from_nodes, to_nodes, flags);
968	if (err)
969		goto out;
970
971	/*
972	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
973	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
974	 * bit in 'tmp', and return that <source, dest> pair for migration.
975	 * The pair of nodemasks 'to' and 'from' define the map.
976	 *
977 * If no pair of bits is found that way, fall back to picking some
978	 * pair of 'source' and 'dest' bits that are not the same.  If the
979	 * 'source' and 'dest' bits are the same, this represents a node
980 * that will be migrating to itself, so no pages need to move.
981	 *
982	 * If no bits are left in 'tmp', or if all remaining bits left
983	 * in 'tmp' correspond to the same bit in 'to', return false
984	 * (nothing left to migrate).
985	 *
986	 * This lets us pick a pair of nodes to migrate between, such that
987	 * if possible the dest node is not already occupied by some other
988	 * source node, minimizing the risk of overloading the memory on a
989	 * node that would happen if we migrated incoming memory to a node
990 * before migrating outgoing memory sourced from that same node.
991	 *
992	 * A single scan of tmp is sufficient.  As we go, we remember the
993	 * most recent <s, d> pair that moved (s != d).  If we find a pair
994	 * that not only moved, but what's better, moved to an empty slot
995 * (d is not set in tmp), then we break out right away with that pair.
996 * Otherwise, when we finish scanning tmp, we at least have the
997	 * most recent <s, d> pair that moved.  If we get all the way through
998	 * the scan of tmp without finding any node that moved, much less
999	 * moved to an empty node, then there is nothing left worth migrating.
1000	 */
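	/*
	 * Worked example: from_nodes = {0,1}, to_nodes = {1,2}, so the remap
	 * sends 0 -> 1 and 1 -> 2.  The scan prefers <1,2> because dest 2 is
	 * not also a pending source, migrates 1 -> 2 and clears 1 from tmp;
	 * the next pass then migrates 0 -> 1 into the node just vacated.
	 */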
1001
1002	tmp = *from_nodes;
1003	while (!nodes_empty(tmp)) {
1004		int s,d;
1005		int source = -1;
1006		int dest = 0;
1007
1008		for_each_node_mask(s, tmp) {
1009			d = node_remap(s, *from_nodes, *to_nodes);
1010			if (s == d)
1011				continue;
1012
1013			source = s;	/* Node moved. Memorize */
1014			dest = d;
1015
1016			/* dest not in remaining from nodes? */
1017			if (!node_isset(dest, tmp))
1018				break;
1019		}
1020		if (source == -1)
1021			break;
1022
1023		node_clear(source, tmp);
1024		err = migrate_to_node(mm, source, dest, flags);
1025		if (err > 0)
1026			busy += err;
1027		if (err < 0)
1028			break;
1029	}
1030out:
1031	up_read(&mm->mmap_sem);
1032	if (err < 0)
1033		return err;
1034	return busy;
1035
1036}
1037
1038/*
1039 * Allocate a new page for page migration based on vma policy.
1040 * Start assuming that page is mapped by vma pointed to by @private.
1041 * Search forward from there, if not.  N.B., this assumes that the
1042 * list of pages handed to migrate_pages()--which is how we get here--
1043 * is in virtual address order.
1044 */
1045static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
1046{
1047	struct vm_area_struct *vma = (struct vm_area_struct *)private;
1048	unsigned long uninitialized_var(address);
1049
1050	while (vma) {
1051		address = page_address_in_vma(page, vma);
1052		if (address != -EFAULT)
1053			break;
1054		vma = vma->vm_next;
1055	}
1056
1057	/*
1058	 * if !vma, alloc_page_vma() will use task or system default policy
1059	 */
1060	return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1061}
1062#else
1063
1064static void migrate_page_add(struct page *page, struct list_head *pagelist,
1065				unsigned long flags)
1066{
1067}
1068
1069int do_migrate_pages(struct mm_struct *mm,
1070	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
1071{
1072	return -ENOSYS;
1073}
1074
1075static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
1076{
1077	return NULL;
1078}
1079#endif
1080
1081static long do_mbind(unsigned long start, unsigned long len,
1082		     unsigned short mode, unsigned short mode_flags,
1083		     nodemask_t *nmask, unsigned long flags)
1084{
1085	struct vm_area_struct *vma;
1086	struct mm_struct *mm = current->mm;
1087	struct mempolicy *new;
1088	unsigned long end;
1089	int err;
1090	LIST_HEAD(pagelist);
1091
1092	if (flags & ~(unsigned long)(MPOL_MF_STRICT |
1093				     MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
1094		return -EINVAL;
1095	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1096		return -EPERM;
1097
1098	if (start & ~PAGE_MASK)
1099		return -EINVAL;
1100
1101	if (mode == MPOL_DEFAULT)
1102		flags &= ~MPOL_MF_STRICT;
1103
1104	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1105	end = start + len;
1106
1107	if (end < start)
1108		return -EINVAL;
1109	if (end == start)
1110		return 0;
1111
1112	new = mpol_new(mode, mode_flags, nmask);
1113	if (IS_ERR(new))
1114		return PTR_ERR(new);
1115
1116	/*
1117	 * If we are using the default policy then operations
1118	 * on discontinuous address spaces are okay after all
1119	 */
1120	if (!new)
1121		flags |= MPOL_MF_DISCONTIG_OK;
1122
1123	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1124		 start, start + len, mode, mode_flags,
1125		 nmask ? nodes_addr(*nmask)[0] : -1);
1126
1127	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1128
1129		err = migrate_prep();
1130		if (err)
1131			goto mpol_out;
1132	}
1133	{
1134		NODEMASK_SCRATCH(scratch);
1135		if (scratch) {
1136			down_write(&mm->mmap_sem);
1137			task_lock(current);
1138			err = mpol_set_nodemask(new, nmask, scratch);
1139			task_unlock(current);
1140			if (err)
1141				up_write(&mm->mmap_sem);
1142		} else
1143			err = -ENOMEM;
1144		NODEMASK_SCRATCH_FREE(scratch);
1145	}
1146	if (err)
1147		goto mpol_out;
1148
1149	vma = check_range(mm, start, end, nmask,
1150			  flags | MPOL_MF_INVERT, &pagelist);
1151
1152	err = PTR_ERR(vma);
1153	if (!IS_ERR(vma)) {
1154		int nr_failed = 0;
1155
1156		err = mbind_range(mm, start, end, new);
1157
1158		if (!list_empty(&pagelist)) {
1159			nr_failed = migrate_pages(&pagelist, new_vma_page,
1160						(unsigned long)vma,
1161						false, true);
1162			if (nr_failed)
1163				putback_lru_pages(&pagelist);
1164		}
1165
1166		if (!err && nr_failed && (flags & MPOL_MF_STRICT))
1167			err = -EIO;
1168	} else
1169		putback_lru_pages(&pagelist);
1170
1171	up_write(&mm->mmap_sem);
1172 mpol_out:
1173	mpol_put(new);
1174	return err;
1175}
1176
1177/*
1178 * User space interface with variable sized bitmaps for nodelists.
1179 */
1180
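/*
 * A worked example of the maxnode convention used by get_nodes() below:
 * maxnode == 65 covers node bits 0-63; after --maxnode, nlongs == 1 and
 * endmask == ~0UL, so exactly one unsigned long is copied from user space.
 */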
1181/* Copy a node mask from user space. */
1182static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1183		     unsigned long maxnode)
1184{
1185	unsigned long k;
1186	unsigned long nlongs;
1187	unsigned long endmask;
1188
1189	--maxnode;
1190	nodes_clear(*nodes);
1191	if (maxnode == 0 || !nmask)
1192		return 0;
1193	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1194		return -EINVAL;
1195
1196	nlongs = BITS_TO_LONGS(maxnode);
1197	if ((maxnode % BITS_PER_LONG) == 0)
1198		endmask = ~0UL;
1199	else
1200		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1201
1202	/* When the user specifies more nodes than supported, just check
1203	   that the unsupported part is all zero. */
1204	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1205		if (nlongs > PAGE_SIZE/sizeof(long))
1206			return -EINVAL;
1207		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1208			unsigned long t;
1209			if (get_user(t, nmask + k))
1210				return -EFAULT;
1211			if (k == nlongs - 1) {
1212				if (t & endmask)
1213					return -EINVAL;
1214			} else if (t)
1215				return -EINVAL;
1216		}
1217		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1218		endmask = ~0UL;
1219	}
1220
1221	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1222		return -EFAULT;
1223	nodes_addr(*nodes)[nlongs-1] &= endmask;
1224	return 0;
1225}
1226
1227/* Copy a kernel node mask to user space */
1228static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1229			      nodemask_t *nodes)
1230{
1231	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1232	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1233
1234	if (copy > nbytes) {
1235		if (copy > PAGE_SIZE)
1236			return -EINVAL;
1237		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1238			return -EFAULT;
1239		copy = nbytes;
1240	}
1241	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1242}
1243
1244SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1245		unsigned long, mode, unsigned long __user *, nmask,
1246		unsigned long, maxnode, unsigned, flags)
1247{
1248	nodemask_t nodes;
1249	int err;
1250	unsigned short mode_flags;
1251
1252	mode_flags = mode & MPOL_MODE_FLAGS;
1253	mode &= ~MPOL_MODE_FLAGS;
1254	if (mode >= MPOL_MAX)
1255		return -EINVAL;
1256	if ((mode_flags & MPOL_F_STATIC_NODES) &&
1257	    (mode_flags & MPOL_F_RELATIVE_NODES))
1258		return -EINVAL;
1259	err = get_nodes(&nodes, nmask, maxnode);
1260	if (err)
1261		return err;
1262	return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1263}
1264
1265/* Set the process memory policy */
1266SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask,
1267		unsigned long, maxnode)
1268{
1269	int err;
1270	nodemask_t nodes;
1271	unsigned short flags;
1272
1273	flags = mode & MPOL_MODE_FLAGS;
1274	mode &= ~MPOL_MODE_FLAGS;
1275	if ((unsigned int)mode >= MPOL_MAX)
1276		return -EINVAL;
1277	if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1278		return -EINVAL;
1279	err = get_nodes(&nodes, nmask, maxnode);
1280	if (err)
1281		return err;
1282	return do_set_mempolicy(mode, flags, &nodes);
1283}
1284
1285SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1286		const unsigned long __user *, old_nodes,
1287		const unsigned long __user *, new_nodes)
1288{
1289	const struct cred *cred = current_cred(), *tcred;
1290	struct mm_struct *mm = NULL;
1291	struct task_struct *task;
1292	nodemask_t task_nodes;
1293	int err;
1294	nodemask_t *old;
1295	nodemask_t *new;
1296	NODEMASK_SCRATCH(scratch);
1297
1298	if (!scratch)
1299		return -ENOMEM;
1300
1301	old = &scratch->mask1;
1302	new = &scratch->mask2;
1303
1304	err = get_nodes(old, old_nodes, maxnode);
1305	if (err)
1306		goto out;
1307
1308	err = get_nodes(new, new_nodes, maxnode);
1309	if (err)
1310		goto out;
1311
1312	/* Find the mm_struct */
1313	rcu_read_lock();
1314	task = pid ? find_task_by_vpid(pid) : current;
1315	if (!task) {
1316		rcu_read_unlock();
1317		err = -ESRCH;
1318		goto out;
1319	}
1320	mm = get_task_mm(task);
1321	rcu_read_unlock();
1322
1323	err = -EINVAL;
1324	if (!mm)
1325		goto out;
1326
1327	/*
1328	 * Check if this process has the right to modify the specified
1329	 * process. The right exists if the process has administrative
1330	 * capabilities, superuser privileges or the same
1331	 * userid as the target process.
1332	 */
1333	rcu_read_lock();
1334	tcred = __task_cred(task);
1335	if (cred->euid != tcred->suid && cred->euid != tcred->uid &&
1336	    cred->uid  != tcred->suid && cred->uid  != tcred->uid &&
1337	    !capable(CAP_SYS_NICE)) {
1338		rcu_read_unlock();
1339		err = -EPERM;
1340		goto out;
1341	}
1342	rcu_read_unlock();
1343
1344	task_nodes = cpuset_mems_allowed(task);
1345	/* Is the user allowed to access the target nodes? */
1346	if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1347		err = -EPERM;
1348		goto out;
1349	}
1350
1351	if (!nodes_subset(*new, node_states[N_HIGH_MEMORY])) {
1352		err = -EINVAL;
1353		goto out;
1354	}
1355
1356	err = security_task_movememory(task);
1357	if (err)
1358		goto out;
1359
1360	err = do_migrate_pages(mm, old, new,
1361		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1362out:
1363	if (mm)
1364		mmput(mm);
1365	NODEMASK_SCRATCH_FREE(scratch);
1366
1367	return err;
1368}
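/*
 * Minimal user-space sketch (illustrative; libnuma's <numaif.h> declares
 * the wrapper): move the pages of process 'pid' from node 0 to node 1,
 * subject to the permission checks above:
 *
 *	unsigned long from = 0x1, to = 0x2;
 *	migrate_pages(pid, sizeof(from) * 8, &from, &to);
 */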
1369
1370
1371/* Retrieve NUMA policy */
1372SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1373		unsigned long __user *, nmask, unsigned long, maxnode,
1374		unsigned long, addr, unsigned long, flags)
1375{
1376	int err;
1377	int uninitialized_var(pval);
1378	nodemask_t nodes;
1379
1380	if (nmask != NULL && maxnode < MAX_NUMNODES)
1381		return -EINVAL;
1382
1383	err = do_get_mempolicy(&pval, &nodes, addr, flags);
1384
1385	if (err)
1386		return err;
1387
1388	if (policy && put_user(pval, policy))
1389		return -EFAULT;
1390
1391	if (nmask)
1392		err = copy_nodes_to_user(nmask, maxnode, &nodes);
1393
1394	return err;
1395}
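/*
 * Minimal user-space sketch (illustrative; libnuma's <numaif.h> declares
 * the wrapper).  Note the check above: when a nodemask is requested,
 * maxnode must be at least MAX_NUMNODES, so the buffer is sized
 * generously here:
 *
 *	int mode;
 *	unsigned long mask[64];
 *	get_mempolicy(&mode, mask, sizeof(mask) * 8, addr, MPOL_F_ADDR);
 */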
1396
1397#ifdef CONFIG_COMPAT
1398
1399asmlinkage long compat_sys_get_mempolicy(int __user *policy,
1400				     compat_ulong_t __user *nmask,
1401				     compat_ulong_t maxnode,
1402				     compat_ulong_t addr, compat_ulong_t flags)
1403{
1404	long err;
1405	unsigned long __user *nm = NULL;
1406	unsigned long nr_bits, alloc_size;
1407	DECLARE_BITMAP(bm, MAX_NUMNODES);
1408
1409	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1410	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1411
1412	if (nmask)
1413		nm = compat_alloc_user_space(alloc_size);
1414
1415	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1416
1417	if (!err && nmask) {
1418		err = copy_from_user(bm, nm, alloc_size);
1419		/* ensure entire bitmap is zeroed */
1420		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1421		err |= compat_put_bitmap(nmask, bm, nr_bits);
1422	}
1423
1424	return err;
1425}
1426
1427asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1428				     compat_ulong_t maxnode)
1429{
1430	long err = 0;
1431	unsigned long __user *nm = NULL;
1432	unsigned long nr_bits, alloc_size;
1433	DECLARE_BITMAP(bm, MAX_NUMNODES);
1434
1435	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1436	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1437
1438	if (nmask) {
1439		err = compat_get_bitmap(bm, nmask, nr_bits);
1440		nm = compat_alloc_user_space(alloc_size);
1441		err |= copy_to_user(nm, bm, alloc_size);
1442	}
1443
1444	if (err)
1445		return -EFAULT;
1446
1447	return sys_set_mempolicy(mode, nm, nr_bits+1);
1448}
1449
1450asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1451			     compat_ulong_t mode, compat_ulong_t __user *nmask,
1452			     compat_ulong_t maxnode, compat_ulong_t flags)
1453{
1454	long err = 0;
1455	unsigned long __user *nm = NULL;
1456	unsigned long nr_bits, alloc_size;
1457	nodemask_t bm;
1458
1459	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1460	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1461
1462	if (nmask) {
1463		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1464		nm = compat_alloc_user_space(alloc_size);
1465		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1466	}
1467
1468	if (err)
1469		return -EFAULT;
1470
1471	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1472}
1473
1474#endif
1475
1476/*
1477 * get_vma_policy(@task, @vma, @addr)
1478 * @task - task for fallback if vma policy == default
1479 * @vma   - virtual memory area whose policy is sought
1480 * @addr  - address in @vma for shared policy lookup
1481 *
1482 * Returns effective policy for a VMA at specified address.
1483 * Falls back to @task or system default policy, as necessary.
1484 * Current or other task's task mempolicy and non-shared vma policies
1485 * are protected by the task's mmap_sem, which must be held for read by
1486 * the caller.
1487 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1488 * count--added by the get_policy() vm_op, as appropriate--to protect against
1489 * freeing by another task.  It is the caller's responsibility to free the
1490 * extra reference for shared policies.
1491 */
1492struct mempolicy *get_vma_policy(struct task_struct *task,
1493		struct vm_area_struct *vma, unsigned long addr)
1494{
1495	struct mempolicy *pol = task->mempolicy;
1496
1497	if (vma) {
1498		if (vma->vm_ops && vma->vm_ops->get_policy) {
1499			struct mempolicy *vpol = vma->vm_ops->get_policy(vma,
1500									addr);
1501			if (vpol)
1502				pol = vpol;
1503		} else if (vma->vm_policy)
1504			pol = vma->vm_policy;
1505	}
1506	if (!pol)
1507		pol = &default_policy;
1508	return pol;
1509}
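/*
 * Typical call pattern for the above (a sketch of the convention used by
 * the callers in this file, not a new API): look up the policy, use it,
 * then drop the conditional reference that shared policies carry:
 *
 *	struct mempolicy *pol = get_vma_policy(current, vma, addr);
 *	... use pol for the allocation decision ...
 *	mpol_cond_put(pol);
 */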
1510
1511/*
1512 * Return a nodemask representing a mempolicy for filtering nodes for
1513 * page allocation
1514 */
1515static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1516{
1517	/* Lower zones don't get a nodemask applied for MPOL_BIND */
1518	if (unlikely(policy->mode == MPOL_BIND) &&
1519			gfp_zone(gfp) >= policy_zone &&
1520			cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1521		return &policy->v.nodes;
1522
1523	return NULL;
1524}
1525
1526/* Return a zonelist indicated by gfp for node representing a mempolicy */
1527static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
1528	int nd)
1529{
1530	switch (policy->mode) {
1531	case MPOL_PREFERRED:
1532		if (!(policy->flags & MPOL_F_LOCAL))
1533			nd = policy->v.preferred_node;
1534		break;
1535	case MPOL_BIND:
1536		/*
1537		 * Normally, MPOL_BIND allocations are node-local within the
1538		 * allowed nodemask.  However, if __GFP_THISNODE is set and the
1539		 * current node isn't part of the mask, we use the zonelist for
1540		 * the first node in the mask instead.
1541		 */
1542		if (unlikely(gfp & __GFP_THISNODE) &&
1543				unlikely(!node_isset(nd, policy->v.nodes)))
1544			nd = first_node(policy->v.nodes);
1545		break;
1546	default:
1547		BUG();
1548	}
1549	return node_zonelist(nd, gfp);
1550}
1551
1552/* Do dynamic interleaving for a process */
1553static unsigned interleave_nodes(struct mempolicy *policy)
1554{
1555	unsigned nid, next;
1556	struct task_struct *me = current;
1557
1558	nid = me->il_next;
1559	next = next_node(nid, policy->v.nodes);
1560	if (next >= MAX_NUMNODES)
1561		next = first_node(policy->v.nodes);
1562	if (next < MAX_NUMNODES)
1563		me->il_next = next;
1564	return nid;
1565}
1566
1567/*
1568 * Depending on the memory policy provide a node from which to allocate the
1569 * next slab entry.
1570 * @policy must be protected from freeing by the caller.  If @policy is
1571 * the current task's mempolicy, this protection is implicit, as only the
1572 * task can change its policy.  The system default policy requires no
1573 * such protection.
1574 */
1575unsigned slab_node(struct mempolicy *policy)
1576{
1577	if (!policy || policy->flags & MPOL_F_LOCAL)
1578		return numa_node_id();
1579
1580	switch (policy->mode) {
1581	case MPOL_PREFERRED:
1582		/*
1583		 * handled MPOL_F_LOCAL above
1584		 */
1585		return policy->v.preferred_node;
1586
1587	case MPOL_INTERLEAVE:
1588		return interleave_nodes(policy);
1589
1590	case MPOL_BIND: {
1591		/*
1592		 * Follow bind policy behavior and start allocation at the
1593		 * first node.
1594		 */
1595		struct zonelist *zonelist;
1596		struct zone *zone;
1597		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1598		zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
1599		(void)first_zones_zonelist(zonelist, highest_zoneidx,
1600							&policy->v.nodes,
1601							&zone);
1602		return zone ? zone->node : numa_node_id();
1603	}
1604
1605	default:
1606		BUG();
1607	}
1608}
1609
1610/* Do static interleaving for a VMA with known offset. */
1611static unsigned offset_il_node(struct mempolicy *pol,
1612		struct vm_area_struct *vma, unsigned long off)
1613{
1614	unsigned nnodes = nodes_weight(pol->v.nodes);
1615	unsigned target;
1616	int c;
1617	int nid = -1;
1618
1619	if (!nnodes)
1620		return numa_node_id();
1621	target = (unsigned int)off % nnodes;
1622	c = 0;
1623	do {
1624		nid = next_node(nid, pol->v.nodes);
1625		c++;
1626	} while (c <= target);
1627	return nid;
1628}
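/*
 * For example, with pol->v.nodes = {0,2,5} and off = 7 in the function
 * above: nnodes == 3, target == 7 % 3 == 1, and the loop stops at the
 * second node in the mask, so that offset interleaves onto node 2.
 */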
1629
1630/* Determine a node number for interleave */
1631static inline unsigned interleave_nid(struct mempolicy *pol,
1632		 struct vm_area_struct *vma, unsigned long addr, int shift)
1633{
1634	if (vma) {
1635		unsigned long off;
1636
1637		/*
1638		 * for small pages, there is no difference between
1639		 * shift and PAGE_SHIFT, so the bit-shift is safe.
1640		 * for huge pages, since vm_pgoff is in units of small
1641		 * pages, we need to shift off the always 0 bits to get
1642		 * a useful offset.
1643		 */
1644		BUG_ON(shift < PAGE_SHIFT);
1645		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1646		off += (addr - vma->vm_start) >> shift;
1647		return offset_il_node(pol, vma, off);
1648	} else
1649		return interleave_nodes(pol);
1650}
1651
1652#ifdef CONFIG_HUGETLBFS
1653/*
1654 * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1655 * @vma = virtual memory area whose policy is sought
1656 * @addr = address in @vma for shared policy lookup and interleave policy
1657 * @gfp_flags = for requested zone
1658 * @mpol = pointer to mempolicy pointer for reference counted mempolicy
1659 * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
1660 *
1661 * Returns a zonelist suitable for a huge page allocation and a pointer
1662 * to the struct mempolicy for conditional unref after allocation.
1663 * If the effective policy is 'BIND', returns a pointer to the mempolicy's
1664 * @nodemask for filtering the zonelist.
1665 *
1666 * Must be protected by get_mems_allowed()
1667 */
1668struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1669				gfp_t gfp_flags, struct mempolicy **mpol,
1670				nodemask_t **nodemask)
1671{
1672	struct zonelist *zl;
1673
1674	*mpol = get_vma_policy(current, vma, addr);
1675	*nodemask = NULL;	/* assume !MPOL_BIND */
1676
1677	if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1678		zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1679				huge_page_shift(hstate_vma(vma))), gfp_flags);
1680	} else {
1681		zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
1682		if ((*mpol)->mode == MPOL_BIND)
1683			*nodemask = &(*mpol)->v.nodes;
1684	}
1685	return zl;
1686}
1687
1688/*
1689 * init_nodemask_of_mempolicy
1690 *
1691 * If the current task's mempolicy is "default" [NULL], return 'false'
1692 * to indicate default policy.  Otherwise, extract the policy nodemask
1693 * for 'bind' or 'interleave' policy into the argument nodemask, or
1694 * initialize the argument nodemask to contain the single node for
1695 * 'preferred' or 'local' policy and return 'true' to indicate presence
1696 * of non-default mempolicy.
1697 *
1698 * We don't bother with reference counting the mempolicy [mpol_get/put]
1699 * because the current task is examining its own mempolicy and a task's
1700 * mempolicy is only ever changed by the task itself.
1701 *
1702 * N.B., it is the caller's responsibility to free a returned nodemask.
1703 */
1704bool init_nodemask_of_mempolicy(nodemask_t *mask)
1705{
1706	struct mempolicy *mempolicy;
1707	int nid;
1708
1709	if (!(mask && current->mempolicy))
1710		return false;
1711
1712	task_lock(current);
1713	mempolicy = current->mempolicy;
1714	switch (mempolicy->mode) {
1715	case MPOL_PREFERRED:
1716		if (mempolicy->flags & MPOL_F_LOCAL)
1717			nid = numa_node_id();
1718		else
1719			nid = mempolicy->v.preferred_node;
1720		init_nodemask_of_node(mask, nid);
1721		break;
1722
1723	case MPOL_BIND:
1724		/* Fall through */
1725	case MPOL_INTERLEAVE:
1726		*mask =  mempolicy->v.nodes;
1727		break;
1728
1729	default:
1730		BUG();
1731	}
1732	task_unlock(current);
1733
1734	return true;
1735}
1736#endif
1737
1738/*
1739 * mempolicy_nodemask_intersects
1740 *
1741 * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
1742 * policy.  Otherwise, check for intersection between mask and the policy
1743 * nodemask for 'bind' or 'interleave' policy.  For 'preferred' or 'local'
1744 * policy, always return true since it may allocate elsewhere on fallback.
1745 *
1746 * Takes task_lock(tsk) to prevent freeing of its mempolicy.
1747 */
1748bool mempolicy_nodemask_intersects(struct task_struct *tsk,
1749					const nodemask_t *mask)
1750{
1751	struct mempolicy *mempolicy;
1752	bool ret = true;
1753
1754	if (!mask)
1755		return ret;
1756	task_lock(tsk);
1757	mempolicy = tsk->mempolicy;
1758	if (!mempolicy)
1759		goto out;
1760
1761	switch (mempolicy->mode) {
1762	case MPOL_PREFERRED:
1763		/*
1764		 * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to
1765 * allocate from; they may fall back to other nodes when OOM.
1766		 * Thus, it's possible for tsk to have allocated memory from
1767		 * nodes in mask.
1768		 */
1769		break;
1770	case MPOL_BIND:
1771	case MPOL_INTERLEAVE:
1772		ret = nodes_intersects(mempolicy->v.nodes, *mask);
1773		break;
1774	default:
1775		BUG();
1776	}
1777out:
1778	task_unlock(tsk);
1779	return ret;
1780}
1781
1782/* Allocate a page under an interleave policy.
1783   Own path because it needs to do special accounting. */
1784static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1785					unsigned nid)
1786{
1787	struct zonelist *zl;
1788	struct page *page;
1789
1790	zl = node_zonelist(nid, gfp);
1791	page = __alloc_pages(gfp, order, zl);
1792	if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1793		inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1794	return page;
1795}
1796
1797/**
1798 * 	alloc_pages_vma	- Allocate a page for a VMA.
1799 *
1800 * 	@gfp:
1801 *      %GFP_USER    user allocation.
1802 *      %GFP_KERNEL  kernel allocations,
1803 *      %GFP_HIGHMEM highmem/user allocations,
1804 *      %GFP_FS      allocation should not call back into a file system.
1805 *      %GFP_ATOMIC  don't sleep.
1806 *
1807 *	@order: Order of the GFP allocation.
1808 * 	@vma:  Pointer to VMA or NULL if not available.
1809 *	@addr: Virtual Address of the allocation. Must be inside the VMA.
1810 *
1811 * 	This function allocates a page from the kernel page pool and applies
1812 *	a NUMA policy associated with the VMA or the current process.
1813 *	When VMA is not NULL caller must hold down_read on the mmap_sem of the
1814 *	mm_struct of the VMA to prevent it from going away. Should be used for
1815 *	all allocations for pages that will be mapped into
1816 * 	user space. Returns NULL when no page can be allocated.
1817 *
1818 *	Should be called with the mmap_sem of the vma held.
1819 */
1820struct page *
1821alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1822		unsigned long addr, int node)
1823{
1824	struct mempolicy *pol = get_vma_policy(current, vma, addr);
1825	struct zonelist *zl;
1826	struct page *page;
1827
1828	get_mems_allowed();
1829	if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
1830		unsigned nid;
1831
1832		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
1833		mpol_cond_put(pol);
1834		page = alloc_page_interleave(gfp, order, nid);
1835		put_mems_allowed();
1836		return page;
1837	}
1838	zl = policy_zonelist(gfp, pol, node);
1839	if (unlikely(mpol_needs_cond_ref(pol))) {
1840		/*
1841		 * slow path: ref counted shared policy
1842		 */
1843		struct page *page =  __alloc_pages_nodemask(gfp, order,
1844						zl, policy_nodemask(gfp, pol));
1845		__mpol_put(pol);
1846		put_mems_allowed();
1847		return page;
1848	}
1849	/*
1850	 * fast path:  default or task policy
1851	 */
1852	page = __alloc_pages_nodemask(gfp, order, zl,
1853				      policy_nodemask(gfp, pol));
1854	put_mems_allowed();
1855	return page;
1856}
1857
1858/**
1859 * 	alloc_pages_current - Allocate pages.
1860 *
1861 *	@gfp:
1862 *		%GFP_USER   user allocation,
1863 *      	%GFP_KERNEL kernel allocation,
1864 *      	%GFP_HIGHMEM highmem allocation,
1865 *      	%GFP_FS     don't call back into a file system.
1866 *      	%GFP_ATOMIC don't sleep.
1867 *	@order: Power of two of allocation size in pages. 0 is a single page.
1868 *
1869 *	Allocate a page from the kernel page pool.  When not in
1870 *	interrupt context, apply the current process' NUMA policy.
1871 *	Returns NULL when no page can be allocated.
1872 *
1873 *	Don't call cpuset_update_task_memory_state() unless
1874 *	1) it's ok to take cpuset_sem (can WAIT), and
1875 *	2) allocating for current task (not interrupt).
1876 */
1877struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1878{
1879	struct mempolicy *pol = current->mempolicy;
1880	struct page *page;
1881
1882	if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1883		pol = &default_policy;
1884
1885	get_mems_allowed();
1886	/*
1887	 * No reference counting needed for current->mempolicy
1888	 * nor system default_policy
1889	 */
1890	if (pol->mode == MPOL_INTERLEAVE)
1891		page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
1892	else
1893		page = __alloc_pages_nodemask(gfp, order,
1894				policy_zonelist(gfp, pol, numa_node_id()),
1895				policy_nodemask(gfp, pol));
1896	put_mems_allowed();
1897	return page;
1898}
1899EXPORT_SYMBOL(alloc_pages_current);
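
/*
 * Illustrative sketch (added example, not part of the original source):
 * on NUMA-enabled kernels the plain alloc_pages() helper resolves to
 * alloc_pages_current(), so an ordinary process-context allocation like
 * the one below already honours the task mempolicy set with
 * set_mempolicy(2).
 *
 *	struct page *page = alloc_pages(GFP_KERNEL, 2);
 *	if (page)
 *		__free_pages(page, 2);
 *
 * (order 2, i.e. four contiguous pages.)
 */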
1900
1901/*
1902 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
1903 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
1904 * with the mems_allowed returned by cpuset_mems_allowed().  This
1905 * keeps mempolicies cpuset-relative after their cpuset moves.  See
1906 * also update_nodemask() in kernel/cpuset.c.
1907 *
1908 * current's mempolicy may be rebound by another task (the task that changes
1909 * the cpuset's mems), so we need not do the rebind work for the current task here.
1910 */
1911
1912/* Slow path of a mempolicy duplicate */
1913struct mempolicy *__mpol_dup(struct mempolicy *old)
1914{
1915	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1916
1917	if (!new)
1918		return ERR_PTR(-ENOMEM);
1919
1920	/* task's mempolicy is protected by alloc_lock */
1921	if (old == current->mempolicy) {
1922		task_lock(current);
1923		*new = *old;
1924		task_unlock(current);
1925	} else
1926		*new = *old;
1927
1928	rcu_read_lock();
1929	if (current_cpuset_is_being_rebound()) {
1930		nodemask_t mems = cpuset_mems_allowed(current);
1931		if (new->flags & MPOL_F_REBINDING)
1932			mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
1933		else
1934			mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
1935	}
1936	rcu_read_unlock();
1937	atomic_set(&new->refcnt, 1);
1938	return new;
1939}
1940
1941/*
1942 * If *frompol needs [has] an extra ref, copy *frompol to *tompol,
1943 * eliminate the MPOL_F_* flags that require conditional ref and
1944 * [NOTE!!!] drop the extra ref.  Not safe to reference *frompol directly
1945 * after return.  Use the returned value.
1946 *
1947 * Allows use of a mempolicy for, e.g., multiple allocations with a single
1948 * policy lookup, even if the policy needs/has extra ref on lookup.
1949 * shmem_readahead needs this.
1950 */
1951struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol,
1952						struct mempolicy *frompol)
1953{
1954	if (!mpol_needs_cond_ref(frompol))
1955		return frompol;
1956
1957	*tompol = *frompol;
1958	tompol->flags &= ~MPOL_F_SHARED;	/* copy doesn't need unref */
1959	__mpol_put(frompol);
1960	return tompol;
1961}
1962
1963/* Slow path of a mempolicy comparison */
1964int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1965{
1966	if (!a || !b)
1967		return 0;
1968	if (a->mode != b->mode)
1969		return 0;
1970	if (a->flags != b->flags)
1971		return 0;
1972	if (mpol_store_user_nodemask(a))
1973		if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
1974			return 0;
1975
1976	switch (a->mode) {
1977	case MPOL_BIND:
1978		/* Fall through */
1979	case MPOL_INTERLEAVE:
1980		return nodes_equal(a->v.nodes, b->v.nodes);
1981	case MPOL_PREFERRED:
1982		return a->v.preferred_node == b->v.preferred_node;
1983	default:
1984		BUG();
1985		return 0;
1986	}
1987}
1988
1989/*
1990 * Shared memory backing store policy support.
1991 *
1992 * Remember policies even when nobody has shared memory mapped.
1993 * The policies are kept in Red-Black tree linked from the inode.
1994 * They are protected by the sp->lock spinlock, which should be held
1995 * for any accesses to the tree.
1996 */
1997
1998/* lookup first element intersecting start-end */
1999/* Caller holds sp->lock */
2000static struct sp_node *
2001sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2002{
2003	struct rb_node *n = sp->root.rb_node;
2004
2005	while (n) {
2006		struct sp_node *p = rb_entry(n, struct sp_node, nd);
2007
2008		if (start >= p->end)
2009			n = n->rb_right;
2010		else if (end <= p->start)
2011			n = n->rb_left;
2012		else
2013			break;
2014	}
2015	if (!n)
2016		return NULL;
2017	for (;;) {
2018		struct sp_node *w = NULL;
2019		struct rb_node *prev = rb_prev(n);
2020		if (!prev)
2021			break;
2022		w = rb_entry(prev, struct sp_node, nd);
2023		if (w->end <= start)
2024			break;
2025		n = prev;
2026	}
2027	return rb_entry(n, struct sp_node, nd);
2028}
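
/*
 * Illustrative example (added, values invented): with stored ranges
 * [0,4) and [6,10), a call sp_lookup(sp, 3, 8) may land on either node
 * during the binary search, since both intersect [3,8); the rb_prev()
 * walk above then backs up while the previous node still ends after 3,
 * so the first intersecting node, [0,4), is what gets returned.
 */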
2029
2030/* Insert a new shared policy into the list. */
2031/* Caller holds sp->lock */
2032static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2033{
2034	struct rb_node **p = &sp->root.rb_node;
2035	struct rb_node *parent = NULL;
2036	struct sp_node *nd;
2037
2038	while (*p) {
2039		parent = *p;
2040		nd = rb_entry(parent, struct sp_node, nd);
2041		if (new->start < nd->start)
2042			p = &(*p)->rb_left;
2043		else if (new->end > nd->end)
2044			p = &(*p)->rb_right;
2045		else
2046			BUG();
2047	}
2048	rb_link_node(&new->nd, parent, p);
2049	rb_insert_color(&new->nd, &sp->root);
2050	pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2051		 new->policy ? new->policy->mode : 0);
2052}
2053
2054/* Find shared policy intersecting idx */
2055struct mempolicy *
2056mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2057{
2058	struct mempolicy *pol = NULL;
2059	struct sp_node *sn;
2060
2061	if (!sp->root.rb_node)
2062		return NULL;
2063	spin_lock(&sp->lock);
2064	sn = sp_lookup(sp, idx, idx+1);
2065	if (sn) {
2066		mpol_get(sn->policy);
2067		pol = sn->policy;
2068	}
2069	spin_unlock(&sp->lock);
2070	return pol;
2071}
2072
2073static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2074{
2075	pr_debug("deleting %lx-l%lx\n", n->start, n->end);
2076	rb_erase(&n->nd, &sp->root);
2077	mpol_put(n->policy);
2078	kmem_cache_free(sn_cache, n);
2079}
2080
2081static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2082				struct mempolicy *pol)
2083{
2084	struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2085
2086	if (!n)
2087		return NULL;
2088	n->start = start;
2089	n->end = end;
2090	mpol_get(pol);
2091	pol->flags |= MPOL_F_SHARED;	/* for unref */
2092	n->policy = pol;
2093	return n;
2094}
2095
2096/* Replace a policy range. */
2097static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2098				 unsigned long end, struct sp_node *new)
2099{
2100	struct sp_node *n, *new2 = NULL;
2101
2102restart:
2103	spin_lock(&sp->lock);
2104	n = sp_lookup(sp, start, end);
2105	/* Take care of old policies in the same range. */
2106	while (n && n->start < end) {
2107		struct rb_node *next = rb_next(&n->nd);
2108		if (n->start >= start) {
2109			if (n->end <= end)
2110				sp_delete(sp, n);
2111			else
2112				n->start = end;
2113		} else {
2114			/* Old policy spanning whole new range. */
2115			if (n->end > end) {
2116				if (!new2) {
2117					spin_unlock(&sp->lock);
2118					new2 = sp_alloc(end, n->end, n->policy);
2119					if (!new2)
2120						return -ENOMEM;
2121					goto restart;
2122				}
2123				n->end = start;
2124				sp_insert(sp, new2);
2125				new2 = NULL;
2126				break;
2127			} else
2128				n->end = start;
2129		}
2130		if (!next)
2131			break;
2132		n = rb_entry(next, struct sp_node, nd);
2133	}
2134	if (new)
2135		sp_insert(sp, new);
2136	spin_unlock(&sp->lock);
2137	if (new2) {
2138		mpol_put(new2->policy);
2139		kmem_cache_free(sn_cache, new2);
2140	}
2141	return 0;
2142}
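
/*
 * Illustrative example (added, values invented): installing a range in
 * the middle of an existing one splits it.  With a single stored node
 * [0,10), a call shared_policy_replace(sp, 2, 6, new) trims the old
 * node to [0,2), inserts a copy of its policy as [6,10) (new2,
 * allocated after dropping sp->lock and retrying), and finally inserts
 * new as [2,6).
 */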
2143
2144/**
2145 * mpol_shared_policy_init - initialize shared policy for inode
2146 * @sp: pointer to inode shared policy
2147 * @mpol:  struct mempolicy to install
2148 *
2149 * Install non-NULL @mpol in inode's shared policy rb-tree.
2150 * On entry, the current task has a reference on a non-NULL @mpol.
2151 * This must be released on exit.
2152 * This is called at inode creation time (get_inode()), so we can use GFP_KERNEL.
2153 */
2154void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2155{
2156	int ret;
2157
2158	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
2159	spin_lock_init(&sp->lock);
2160
2161	if (mpol) {
2162		struct vm_area_struct pvma;
2163		struct mempolicy *new;
2164		NODEMASK_SCRATCH(scratch);
2165
2166		if (!scratch)
2167			goto put_mpol;
2168		/* contextualize the tmpfs mount point mempolicy */
2169		new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2170		if (IS_ERR(new))
2171			goto free_scratch; /* no valid nodemask intersection */
2172
2173		task_lock(current);
2174		ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2175		task_unlock(current);
2176		if (ret)
2177			goto put_new;
2178
2179		/* Create pseudo-vma that contains just the policy */
2180		memset(&pvma, 0, sizeof(struct vm_area_struct));
2181		pvma.vm_end = TASK_SIZE;	/* policy covers entire file */
2182		mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2183
2184put_new:
2185		mpol_put(new);			/* drop initial ref */
2186free_scratch:
2187		NODEMASK_SCRATCH_FREE(scratch);
2188put_mpol:
2189		mpol_put(mpol);	/* drop our incoming ref on sb mpol */
2190	}
2191}
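
/*
 * Illustrative sketch (added, not part of the original source): tmpfs
 * is the expected caller.  When shmem creates an inode it passes the
 * superblock's "mpol=" mount-option policy, if any, roughly as:
 *
 *	mpol_shared_policy_init(&info->policy, shmem_get_sbmpol(sbinfo));
 *
 * where shmem_get_sbmpol() takes the reference that is dropped at the
 * put_mpol label above.
 */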
2192
2193int mpol_set_shared_policy(struct shared_policy *info,
2194			struct vm_area_struct *vma, struct mempolicy *npol)
2195{
2196	int err;
2197	struct sp_node *new = NULL;
2198	unsigned long sz = vma_pages(vma);
2199
2200	pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2201		 vma->vm_pgoff,
2202		 sz, npol ? npol->mode : -1,
2203		 npol ? npol->flags : -1,
2204		 npol ? nodes_addr(npol->v.nodes)[0] : -1);
2205
2206	if (npol) {
2207		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2208		if (!new)
2209			return -ENOMEM;
2210	}
2211	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2212	if (err && new)
2213		kmem_cache_free(sn_cache, new);
2214	return err;
2215}
2216
2217/* Free a backing policy store on inode delete. */
2218void mpol_free_shared_policy(struct shared_policy *p)
2219{
2220	struct sp_node *n;
2221	struct rb_node *next;
2222
2223	if (!p->root.rb_node)
2224		return;
2225	spin_lock(&p->lock);
2226	next = rb_first(&p->root);
2227	while (next) {
2228		n = rb_entry(next, struct sp_node, nd);
2229		next = rb_next(&n->nd);
2230		rb_erase(&n->nd, &p->root);
2231		mpol_put(n->policy);
2232		kmem_cache_free(sn_cache, n);
2233	}
2234	spin_unlock(&p->lock);
2235}
2236
2237/* assumes fs == KERNEL_DS */
2238void __init numa_policy_init(void)
2239{
2240	nodemask_t interleave_nodes;
2241	unsigned long largest = 0;
2242	int nid, prefer = 0;
2243
2244	policy_cache = kmem_cache_create("numa_policy",
2245					 sizeof(struct mempolicy),
2246					 0, SLAB_PANIC, NULL);
2247
2248	sn_cache = kmem_cache_create("shared_policy_node",
2249				     sizeof(struct sp_node),
2250				     0, SLAB_PANIC, NULL);
2251
2252	/*
2253	 * Set interleaving policy for system init. Interleaving is only
2254	 * enabled across suitably sized nodes (default is >= 16MB), or
2255	 * fall back to the largest node if they're all smaller.
2256	 */
2257	nodes_clear(interleave_nodes);
2258	for_each_node_state(nid, N_HIGH_MEMORY) {
2259		unsigned long total_pages = node_present_pages(nid);
2260
2261		/* Preserve the largest node */
2262		if (largest < total_pages) {
2263			largest = total_pages;
2264			prefer = nid;
2265		}
2266
2267		/* Interleave this node? */
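		/* (16 << 20) is 16MB: with 4KB pages, at least 4096 present pages. */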
2268		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2269			node_set(nid, interleave_nodes);
2270	}
2271
2272	/* All too small, use the largest */
2273	if (unlikely(nodes_empty(interleave_nodes)))
2274		node_set(prefer, interleave_nodes);
2275
2276	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2277		printk(KERN_WARNING "numa_policy_init: interleaving failed\n");
2278}
2279
2280/* Reset policy of current process to default */
2281void numa_default_policy(void)
2282{
2283	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2284}
2285
2286/*
2287 * Parse and format mempolicy from/to strings
2288 */
2289
2290/*
2291 * "local" is pseudo-policy:  MPOL_PREFERRED with MPOL_F_LOCAL flag
2292 * Used only for mpol_parse_str() and mpol_to_str()
2293 */
2294#define MPOL_LOCAL MPOL_MAX
2295static const char * const policy_modes[] =
2296{
2297	[MPOL_DEFAULT]    = "default",
2298	[MPOL_PREFERRED]  = "prefer",
2299	[MPOL_BIND]       = "bind",
2300	[MPOL_INTERLEAVE] = "interleave",
2301	[MPOL_LOCAL]      = "local"
2302};
2303
2304
2305#ifdef CONFIG_TMPFS
2306/**
2307 * mpol_parse_str - parse string to mempolicy
2308 * @str:  string containing mempolicy to parse
2309 * @mpol:  pointer to struct mempolicy pointer, returned on success.
2310 * @no_context:  flag whether to "contextualize" the mempolicy
2311 *
2312 * Format of input:
2313 *	<mode>[=<flags>][:<nodelist>]
2314 *
2315 * if @no_context is true, save the input nodemask in w.user_nodemask in
2316 * the returned mempolicy.  This will be used to "clone" the mempolicy in
2317 * a specific context [cpuset] at a later time.  Used to parse tmpfs mpol
2318 * mount option.  Note that if 'static' or 'relative' mode flags were
2319 * specified, the input nodemask will already have been saved.  Saving
2320 * it again is redundant, but safe.
2321 *
2322 * On success, returns 0, else 1
2323 */
2324int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2325{
2326	struct mempolicy *new = NULL;
2327	unsigned short mode;
2328	unsigned short uninitialized_var(mode_flags);
2329	nodemask_t nodes;
2330	char *nodelist = strchr(str, ':');
2331	char *flags = strchr(str, '=');
2332	int err = 1;
2333
2334	if (nodelist) {
2335		/* NUL-terminate mode or flags string */
2336		*nodelist++ = '\0';
2337		if (nodelist_parse(nodelist, nodes))
2338			goto out;
2339		if (!nodes_subset(nodes, node_states[N_HIGH_MEMORY]))
2340			goto out;
2341	} else
2342		nodes_clear(nodes);
2343
2344	if (flags)
2345		*flags++ = '\0';	/* terminate mode string */
2346
2347	for (mode = 0; mode <= MPOL_LOCAL; mode++) {
2348		if (!strcmp(str, policy_modes[mode])) {
2349			break;
2350		}
2351	}
2352	if (mode > MPOL_LOCAL)
2353		goto out;
2354
2355	switch (mode) {
2356	case MPOL_PREFERRED:
2357		/*
2358		 * Insist on a nodelist of one node only
2359		 */
2360		if (nodelist) {
2361			char *rest = nodelist;
2362			while (isdigit(*rest))
2363				rest++;
2364			if (*rest)
2365				goto out;
2366		}
2367		break;
2368	case MPOL_INTERLEAVE:
2369		/*
2370		 * Default to online nodes with memory if no nodelist
2371		 */
2372		if (!nodelist)
2373			nodes = node_states[N_HIGH_MEMORY];
2374		break;
2375	case MPOL_LOCAL:
2376		/*
2377		 * Don't allow a nodelist;  mpol_new() checks flags
2378		 */
2379		if (nodelist)
2380			goto out;
2381		mode = MPOL_PREFERRED;
2382		break;
2383	case MPOL_DEFAULT:
2384		/*
2385		 * Insist on a empty nodelist
2386		 */
2387		if (!nodelist)
2388			err = 0;
2389		goto out;
2390	case MPOL_BIND:
2391		/*
2392		 * Insist on a nodelist
2393		 */
2394		if (!nodelist)
2395			goto out;
2396	}
2397
2398	mode_flags = 0;
2399	if (flags) {
2400		/*
2401		 * Currently, we only support two mutually exclusive
2402		 * mode flags.
2403		 */
2404		if (!strcmp(flags, "static"))
2405			mode_flags |= MPOL_F_STATIC_NODES;
2406		else if (!strcmp(flags, "relative"))
2407			mode_flags |= MPOL_F_RELATIVE_NODES;
2408		else
2409			goto out;
2410	}
2411
2412	new = mpol_new(mode, mode_flags, &nodes);
2413	if (IS_ERR(new))
2414		goto out;
2415
2416	if (no_context) {
2417		/* save for contextualization */
2418		new->w.user_nodemask = nodes;
2419	} else {
2420		int ret;
2421		NODEMASK_SCRATCH(scratch);
2422		if (scratch) {
2423			task_lock(current);
2424			ret = mpol_set_nodemask(new, &nodes, scratch);
2425			task_unlock(current);
2426		} else
2427			ret = -ENOMEM;
2428		NODEMASK_SCRATCH_FREE(scratch);
2429		if (ret) {
2430			mpol_put(new);
2431			goto out;
2432		}
2433	}
2434	err = 0;
2435
2436out:
2437	/* Restore string for error message */
2438	if (nodelist)
2439		*--nodelist = ':';
2440	if (flags)
2441		*--flags = '=';
2442	if (!err)
2443		*mpol = new;
2444	return err;
2445}
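
/*
 * Illustrative examples (added) of strings accepted above:
 *
 *	"default"                  no nodelist allowed
 *	"prefer:1"                 exactly one node
 *	"bind:0-3"                 nodelist required
 *	"interleave=static:0,2,4"  optional mode flag before the nodelist
 *	"local"                    parsed as MPOL_PREFERRED + MPOL_F_LOCAL
 *
 * tmpfs hands the value of its "mpol=" mount option to this function
 * with @no_context set, e.g. for mount -t tmpfs -o mpol=bind:0-2 ...
 */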
2446#endif /* CONFIG_TMPFS */
2447
2448/**
2449 * mpol_to_str - format a mempolicy structure for printing
2450 * @buffer:  to contain formatted mempolicy string
2451 * @maxlen:  length of @buffer
2452 * @pol:  pointer to mempolicy to be formatted
2453 * @no_context:  "context free" mempolicy - use nodemask in w.user_nodemask
2454 *
2455 * Convert a mempolicy into a string.
2456 * Returns the number of characters in buffer (if positive)
2457 * or an error (negative)
2458 */
2459int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
2460{
2461	char *p = buffer;
2462	int l;
2463	nodemask_t nodes;
2464	unsigned short mode;
2465	unsigned short flags = pol ? pol->flags : 0;
2466
2467	/*
2468	 * Sanity check:  room for longest mode, flag and some nodes
2469	 */
2470	VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16);
2471
2472	if (!pol || pol == &default_policy)
2473		mode = MPOL_DEFAULT;
2474	else
2475		mode = pol->mode;
2476
2477	switch (mode) {
2478	case MPOL_DEFAULT:
2479		nodes_clear(nodes);
2480		break;
2481
2482	case MPOL_PREFERRED:
2483		nodes_clear(nodes);
2484		if (flags & MPOL_F_LOCAL)
2485			mode = MPOL_LOCAL;	/* pseudo-policy */
2486		else
2487			node_set(pol->v.preferred_node, nodes);
2488		break;
2489
2490	case MPOL_BIND:
2491		/* Fall through */
2492	case MPOL_INTERLEAVE:
2493		if (no_context)
2494			nodes = pol->w.user_nodemask;
2495		else
2496			nodes = pol->v.nodes;
2497		break;
2498
2499	default:
2500		BUG();
2501	}
2502
2503	l = strlen(policy_modes[mode]);
2504	if (buffer + maxlen < p + l + 1)
2505		return -ENOSPC;
2506
2507	strcpy(p, policy_modes[mode]);
2508	p += l;
2509
2510	if (flags & MPOL_MODE_FLAGS) {
2511		if (buffer + maxlen < p + 2)
2512			return -ENOSPC;
2513		*p++ = '=';
2514
2515		/*
2516		 * Currently, the only defined flags are mutually exclusive
2517		 */
2518		if (flags & MPOL_F_STATIC_NODES)
2519			p += snprintf(p, buffer + maxlen - p, "static");
2520		else if (flags & MPOL_F_RELATIVE_NODES)
2521			p += snprintf(p, buffer + maxlen - p, "relative");
2522	}
2523
2524	if (!nodes_empty(nodes)) {
2525		if (buffer + maxlen < p + 2)
2526			return -ENOSPC;
2527		*p++ = ':';
2528		p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
2529	}
2530	return p - buffer;
2531}
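
/*
 * Illustrative note (added): mpol_to_str() emits the same syntax that
 * mpol_parse_str() accepts, e.g. "interleave=static:0-3", "prefer:1"
 * or just "default", so a policy can round-trip through its string
 * form.  show_numa_map() below uses it to label each VMA in
 * /proc/<pid>/numa_maps.
 */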
2532
2533struct numa_maps {
2534	unsigned long pages;
2535	unsigned long anon;
2536	unsigned long active;
2537	unsigned long writeback;
2538	unsigned long mapcount_max;
2539	unsigned long dirty;
2540	unsigned long swapcache;
2541	unsigned long node[MAX_NUMNODES];
2542};
2543
2544static void gather_stats(struct page *page, void *private, int pte_dirty)
2545{
2546	struct numa_maps *md = private;
2547	int count = page_mapcount(page);
2548
2549	md->pages++;
2550	if (pte_dirty || PageDirty(page))
2551		md->dirty++;
2552
2553	if (PageSwapCache(page))
2554		md->swapcache++;
2555
2556	if (PageActive(page) || PageUnevictable(page))
2557		md->active++;
2558
2559	if (PageWriteback(page))
2560		md->writeback++;
2561
2562	if (PageAnon(page))
2563		md->anon++;
2564
2565	if (count > md->mapcount_max)
2566		md->mapcount_max = count;
2567
2568	md->node[page_to_nid(page)]++;
2569}
2570
2571#ifdef CONFIG_HUGETLB_PAGE
2572static void check_huge_range(struct vm_area_struct *vma,
2573		unsigned long start, unsigned long end,
2574		struct numa_maps *md)
2575{
2576	unsigned long addr;
2577	struct page *page;
2578	struct hstate *h = hstate_vma(vma);
2579	unsigned long sz = huge_page_size(h);
2580
2581	for (addr = start; addr < end; addr += sz) {
2582		pte_t *ptep = huge_pte_offset(vma->vm_mm,
2583						addr & huge_page_mask(h));
2584		pte_t pte;
2585
2586		if (!ptep)
2587			continue;
2588
2589		pte = *ptep;
2590		if (pte_none(pte))
2591			continue;
2592
2593		page = pte_page(pte);
2594		if (!page)
2595			continue;
2596
2597		gather_stats(page, md, pte_dirty(*ptep));
2598	}
2599}
2600#else
2601static inline void check_huge_range(struct vm_area_struct *vma,
2602		unsigned long start, unsigned long end,
2603		struct numa_maps *md)
2604{
2605}
2606#endif
2607
2608/*
2609 * Display pages allocated per node and memory policy via /proc.
2610 */
2611int show_numa_map(struct seq_file *m, void *v)
2612{
2613	struct proc_maps_private *priv = m->private;
2614	struct vm_area_struct *vma = v;
2615	struct numa_maps *md;
2616	struct file *file = vma->vm_file;
2617	struct mm_struct *mm = vma->vm_mm;
2618	struct mempolicy *pol;
2619	int n;
2620	char buffer[50];
2621
2622	if (!mm)
2623		return 0;
2624
2625	md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
2626	if (!md)
2627		return 0;
2628
2629	pol = get_vma_policy(priv->task, vma, vma->vm_start);
2630	mpol_to_str(buffer, sizeof(buffer), pol, 0);
2631	mpol_cond_put(pol);
2632
2633	seq_printf(m, "%08lx %s", vma->vm_start, buffer);
2634
2635	if (file) {
2636		seq_printf(m, " file=");
2637		seq_path(m, &file->f_path, "\n\t= ");
2638	} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
2639		seq_printf(m, " heap");
2640	} else if (vma->vm_start <= mm->start_stack &&
2641			vma->vm_end >= mm->start_stack) {
2642		seq_printf(m, " stack");
2643	}
2644
2645	if (is_vm_hugetlb_page(vma)) {
2646		check_huge_range(vma, vma->vm_start, vma->vm_end, md);
2647		seq_printf(m, " huge");
2648	} else {
2649		check_pgd_range(vma, vma->vm_start, vma->vm_end,
2650			&node_states[N_HIGH_MEMORY], MPOL_MF_STATS, md);
2651	}
2652
2653	if (!md->pages)
2654		goto out;
2655
2656	if (md->anon)
2657		seq_printf(m, " anon=%lu", md->anon);
2658
2659	if (md->dirty)
2660		seq_printf(m, " dirty=%lu", md->dirty);
2661
2662	if (md->pages != md->anon && md->pages != md->dirty)
2663		seq_printf(m, " mapped=%lu", md->pages);
2664
2665	if (md->mapcount_max > 1)
2666		seq_printf(m, " mapmax=%lu", md->mapcount_max);
2667
2668	if (md->swapcache)
2669		seq_printf(m, " swapcache=%lu", md->swapcache);
2670
2671	if (md->active < md->pages && !is_vm_hugetlb_page(vma))
2672		seq_printf(m, " active=%lu", md->active);
2673
2674	if (md->writeback)
2675		seq_printf(m, " writeback=%lu", md->writeback);
2676
2677	for_each_node_state(n, N_HIGH_MEMORY)
2678		if (md->node[n])
2679			seq_printf(m, " N%d=%lu", n, md->node[n]);
2680out:
2681	seq_putc(m, '\n');
2682	kfree(md);
2683
2684	if (m->count < m->size)
2685		m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
2686	return 0;
2687}
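
/*
 * Illustrative example (added, values invented) of a resulting
 * /proc/<pid>/numa_maps line:
 *
 *	7f1a2b400000 interleave=static:0-3 anon=512 dirty=512 mapmax=2 N0=128 N1=128 N2=128 N3=128
 *
 * i.e. the VMA start address, the policy string from mpol_to_str(),
 * then only the non-zero counters gathered above plus the per-node
 * page counts.
 */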
2688