1/*
2 * Simple NUMA memory policy for the Linux kernel.
3 *
4 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
6 * Subject to the GNU Public License, version 2.
7 *
8 * NUMA policy allows the user to give hints in which node(s) memory should
9 * be allocated.
10 *
11 * Support four policies per VMA and per process:
12 *
13 * The VMA policy has priority over the process policy for a page fault.
14 *
 * interleave     Allocate memory interleaved over a set of nodes,
 *                with normal fallback if it fails.
 *                For VMA based allocations this interleaves based on the
 *                offset into the backing object or offset into the mapping
 *                for anonymous memory. For process policy a per-process
 *                counter is used.
 *
 * bind           Only allocate memory on a specific set of nodes,
 *                no fallback.
 *                FIXME: memory is allocated starting with the first node
 *                and proceeding to the last. It would be better if bind
 *                truly restricted the allocation to the given memory nodes.
 *
 * preferred      Try a specific node first before normal fallback.
 *                As a special case NUMA_NO_NODE here means do the allocation
 *                on the local CPU. This is normally identical to default,
 *                but useful to set in a VMA when you have a non-default
 *                process policy.
 *
 * default        Allocate on the local node first, or when on a VMA
 *                use the process policy. This is what Linux always did
 *		  in a NUMA aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non-interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for memory
 * allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem, kernel lowmem allocations don't get policied.
 * Same with GFP_DMA allocations.
51 *
52 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
53 * all users and remembered even when nobody has memory mapped.
54 */
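
/*
 * Illustrative userspace sketch of the policies described above (assumes
 * libnuma's <numaif.h> wrappers for the set_mempolicy(2) and mbind(2)
 * syscalls; node numbers and sizes are examples only, error handling is
 * trimmed):
 *
 *	#include <numaif.h>
 *	#include <sys/mman.h>
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		unsigned long nodes = 0x3;	// nodes 0 and 1
 *		void *buf;
 *
 *		// process policy: interleave future allocations over nodes 0-1
 *		if (set_mempolicy(MPOL_INTERLEAVE, &nodes, sizeof(nodes) * 8 + 1))
 *			perror("set_mempolicy");
 *
 *		// VMA policy: bind one mapping to node 0 only
 *		buf = mmap(NULL, 1 << 20, PROT_READ | PROT_WRITE,
 *			   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *		nodes = 0x1;
 *		if (mbind(buf, 1 << 20, MPOL_BIND, &nodes,
 *			  sizeof(nodes) * 8 + 1, 0))
 *			perror("mbind");
 *
 *		// back to the default (local) policy
 *		set_mempolicy(MPOL_DEFAULT, NULL, 0);
 *		return 0;
 *	}
 */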
55
/* Notebook:
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
   first item above.
   handle mremap for shared memory (currently ignored for the policy)
   grows down?
   make bind policy root only? It can trigger oom much faster and the
   kernel does not always handle that gracefully.
*/
67
68#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
69
70#include <linux/mempolicy.h>
71#include <linux/mm.h>
72#include <linux/highmem.h>
73#include <linux/hugetlb.h>
74#include <linux/kernel.h>
75#include <linux/sched.h>
76#include <linux/nodemask.h>
77#include <linux/cpuset.h>
78#include <linux/slab.h>
79#include <linux/string.h>
80#include <linux/export.h>
81#include <linux/nsproxy.h>
82#include <linux/interrupt.h>
83#include <linux/init.h>
84#include <linux/compat.h>
85#include <linux/swap.h>
86#include <linux/seq_file.h>
87#include <linux/proc_fs.h>
88#include <linux/migrate.h>
89#include <linux/ksm.h>
90#include <linux/rmap.h>
91#include <linux/security.h>
92#include <linux/syscalls.h>
93#include <linux/ctype.h>
94#include <linux/mm_inline.h>
95#include <linux/mmu_notifier.h>
96#include <linux/printk.h>
97
98#include <asm/tlbflush.h>
99#include <asm/uaccess.h>
100#include <linux/random.h>
101
102#include "internal.h"
103
104/* Internal flags */
105#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
106#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
107
108static struct kmem_cache *policy_cache;
109static struct kmem_cache *sn_cache;
110
/* Highest zone. A specific allocation for a zone below that is not
   policied. */
113enum zone_type policy_zone = 0;
114
115/*
116 * run-time system-wide default policy => local allocation
117 */
118static struct mempolicy default_policy = {
119	.refcnt = ATOMIC_INIT(1), /* never free it */
120	.mode = MPOL_PREFERRED,
121	.flags = MPOL_F_LOCAL,
122};
123
124static struct mempolicy preferred_node_policy[MAX_NUMNODES];
125
126struct mempolicy *get_task_policy(struct task_struct *p)
127{
128	struct mempolicy *pol = p->mempolicy;
129	int node;
130
131	if (pol)
132		return pol;
133
134	node = numa_node_id();
135	if (node != NUMA_NO_NODE) {
136		pol = &preferred_node_policy[node];
137		/* preferred_node_policy is not initialised early in boot */
138		if (pol->mode)
139			return pol;
140	}
141
142	return &default_policy;
143}
144
145static const struct mempolicy_operations {
146	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
147	/*
	 * If the read-side task has no lock to protect task->mempolicy, the
	 * write-side task rebinds task->mempolicy in two steps: the first
	 * step sets all the newly allowed nodes, and the second step clears
	 * all the disallowed nodes. This way we avoid a window in which no
	 * node is left to allocate a page from.
	 * If there is a lock protecting task->mempolicy on the read side,
	 * the rebind is done directly in one step.
	 *
	 * step:
	 * 	MPOL_REBIND_ONCE  - do the rebind work at once
	 * 	MPOL_REBIND_STEP1 - set all the newly allowed nodes
	 * 	MPOL_REBIND_STEP2 - clear all the disallowed nodes
160	 */
161	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
162			enum mpol_rebind_step step);
163} mpol_ops[MPOL_MAX];
164
165/* Check that the nodemask contains at least one populated zone */
166static int is_valid_nodemask(const nodemask_t *nodemask)
167{
168	return nodes_intersects(*nodemask, node_states[N_MEMORY]);
169}
170
171static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
172{
173	return pol->flags & MPOL_MODE_FLAGS;
174}
175
176static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
177				   const nodemask_t *rel)
178{
179	nodemask_t tmp;
180	nodes_fold(tmp, *orig, nodes_weight(*rel));
181	nodes_onto(*ret, tmp, *rel);
182}
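
/*
 * Worked example (illustrative numbers): with MPOL_F_RELATIVE_NODES the
 * user's nodemask is interpreted relative to the set of allowed nodes.
 * If the cpuset allows nodes {4,6,8} (weight 3) and the user asked for
 * relative nodes {0,2}, then
 *
 *	nodes_fold(tmp, {0,2}, 3)        -> tmp = {0,2}  (folded modulo 3)
 *	nodes_onto(*ret, {0,2}, {4,6,8}) -> ret = {4,8}
 *
 * i.e. the 0th and 2nd allowed nodes.  A relative node beyond the weight
 * wraps around: relative node 4 folds to 4 % 3 = 1 and maps onto node 6.
 */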
183
184static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
185{
186	if (nodes_empty(*nodes))
187		return -EINVAL;
188	pol->v.nodes = *nodes;
189	return 0;
190}
191
192static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
193{
194	if (!nodes)
195		pol->flags |= MPOL_F_LOCAL;	/* local allocation */
196	else if (nodes_empty(*nodes))
197		return -EINVAL;			/*  no allowed nodes */
198	else
199		pol->v.preferred_node = first_node(*nodes);
200	return 0;
201}
202
203static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
204{
205	if (!is_valid_nodemask(nodes))
206		return -EINVAL;
207	pol->v.nodes = *nodes;
208	return 0;
209}
210
211/*
212 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
213 * any, for the new policy.  mpol_new() has already validated the nodes
214 * parameter with respect to the policy mode and flags.  But, we need to
215 * handle an empty nodemask with MPOL_PREFERRED here.
216 *
 * Must be called holding the task's alloc_lock to protect the task's
 * mems_allowed and mempolicy.  May also be called holding mmap_sem for write.
219 */
220static int mpol_set_nodemask(struct mempolicy *pol,
221		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
222{
223	int ret;
224
225	/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
226	if (pol == NULL)
227		return 0;
228	/* Check N_MEMORY */
229	nodes_and(nsc->mask1,
230		  cpuset_current_mems_allowed, node_states[N_MEMORY]);
231
232	VM_BUG_ON(!nodes);
233	if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
234		nodes = NULL;	/* explicit local allocation */
235	else {
236		if (pol->flags & MPOL_F_RELATIVE_NODES)
237			mpol_relative_nodemask(&nsc->mask2, nodes,&nsc->mask1);
238		else
239			nodes_and(nsc->mask2, *nodes, nsc->mask1);
240
241		if (mpol_store_user_nodemask(pol))
242			pol->w.user_nodemask = *nodes;
243		else
244			pol->w.cpuset_mems_allowed =
245						cpuset_current_mems_allowed;
246	}
247
248	if (nodes)
249		ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
250	else
251		ret = mpol_ops[pol->mode].create(pol, NULL);
252	return ret;
253}
254
255/*
 * This function just creates a new policy, does some checks and simple
 * initialization. You must invoke mpol_set_nodemask() to set the nodes.
258 */
259static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
260				  nodemask_t *nodes)
261{
262	struct mempolicy *policy;
263
264	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
265		 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
266
267	if (mode == MPOL_DEFAULT) {
268		if (nodes && !nodes_empty(*nodes))
269			return ERR_PTR(-EINVAL);
270		return NULL;
271	}
272	VM_BUG_ON(!nodes);
273
274	/*
275	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
276	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
277	 * All other modes require a valid pointer to a non-empty nodemask.
278	 */
279	if (mode == MPOL_PREFERRED) {
280		if (nodes_empty(*nodes)) {
281			if (((flags & MPOL_F_STATIC_NODES) ||
282			     (flags & MPOL_F_RELATIVE_NODES)))
283				return ERR_PTR(-EINVAL);
284		}
285	} else if (mode == MPOL_LOCAL) {
286		if (!nodes_empty(*nodes))
287			return ERR_PTR(-EINVAL);
288		mode = MPOL_PREFERRED;
289	} else if (nodes_empty(*nodes))
290		return ERR_PTR(-EINVAL);
291	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
292	if (!policy)
293		return ERR_PTR(-ENOMEM);
294	atomic_set(&policy->refcnt, 1);
295	policy->mode = mode;
296	policy->flags = flags;
297
298	return policy;
299}
300
301/* Slow path of a mpol destructor. */
302void __mpol_put(struct mempolicy *p)
303{
304	if (!atomic_dec_and_test(&p->refcnt))
305		return;
306	kmem_cache_free(policy_cache, p);
307}
308
309static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes,
310				enum mpol_rebind_step step)
311{
312}
313
314/*
315 * step:
 * 	MPOL_REBIND_ONCE  - do the rebind work at once
 * 	MPOL_REBIND_STEP1 - set all the newly allowed nodes
 * 	MPOL_REBIND_STEP2 - clear all the disallowed nodes
319 */
320static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
321				 enum mpol_rebind_step step)
322{
323	nodemask_t tmp;
324
325	if (pol->flags & MPOL_F_STATIC_NODES)
326		nodes_and(tmp, pol->w.user_nodemask, *nodes);
327	else if (pol->flags & MPOL_F_RELATIVE_NODES)
328		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
329	else {
330		/*
331		 * if step == 1, we use ->w.cpuset_mems_allowed to cache the
332		 * result
333		 */
334		if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) {
335			nodes_remap(tmp, pol->v.nodes,
336					pol->w.cpuset_mems_allowed, *nodes);
337			pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
338		} else if (step == MPOL_REBIND_STEP2) {
339			tmp = pol->w.cpuset_mems_allowed;
340			pol->w.cpuset_mems_allowed = *nodes;
341		} else
342			BUG();
343	}
344
345	if (nodes_empty(tmp))
346		tmp = *nodes;
347
348	if (step == MPOL_REBIND_STEP1)
349		nodes_or(pol->v.nodes, pol->v.nodes, tmp);
350	else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2)
351		pol->v.nodes = tmp;
352	else
353		BUG();
354
355	if (!node_isset(current->il_next, tmp)) {
356		current->il_next = next_node(current->il_next, tmp);
357		if (current->il_next >= MAX_NUMNODES)
358			current->il_next = first_node(tmp);
359		if (current->il_next >= MAX_NUMNODES)
360			current->il_next = numa_node_id();
361	}
362}
363
364static void mpol_rebind_preferred(struct mempolicy *pol,
365				  const nodemask_t *nodes,
366				  enum mpol_rebind_step step)
367{
368	nodemask_t tmp;
369
370	if (pol->flags & MPOL_F_STATIC_NODES) {
371		int node = first_node(pol->w.user_nodemask);
372
373		if (node_isset(node, *nodes)) {
374			pol->v.preferred_node = node;
375			pol->flags &= ~MPOL_F_LOCAL;
376		} else
377			pol->flags |= MPOL_F_LOCAL;
378	} else if (pol->flags & MPOL_F_RELATIVE_NODES) {
379		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
380		pol->v.preferred_node = first_node(tmp);
381	} else if (!(pol->flags & MPOL_F_LOCAL)) {
382		pol->v.preferred_node = node_remap(pol->v.preferred_node,
383						   pol->w.cpuset_mems_allowed,
384						   *nodes);
385		pol->w.cpuset_mems_allowed = *nodes;
386	}
387}
388
389/*
390 * mpol_rebind_policy - Migrate a policy to a different set of nodes
391 *
 * If the read-side task has no lock to protect task->mempolicy, the
 * write-side task rebinds task->mempolicy in two steps: the first
 * step sets all the newly allowed nodes, and the second step clears
 * all the disallowed nodes. This way we avoid a window in which no
 * node is left to allocate a page from.
 * If there is a lock protecting task->mempolicy on the read side,
 * the rebind is done directly in one step.
 *
 * step:
 * 	MPOL_REBIND_ONCE  - do the rebind work at once
 * 	MPOL_REBIND_STEP1 - set all the newly allowed nodes
 * 	MPOL_REBIND_STEP2 - clear all the disallowed nodes
404 */
405static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
406				enum mpol_rebind_step step)
407{
408	if (!pol)
409		return;
410	if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE &&
411	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
412		return;
413
414	if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
415		return;
416
417	if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
418		BUG();
419
420	if (step == MPOL_REBIND_STEP1)
421		pol->flags |= MPOL_F_REBINDING;
422	else if (step == MPOL_REBIND_STEP2)
423		pol->flags &= ~MPOL_F_REBINDING;
424	else if (step >= MPOL_REBIND_NSTEP)
425		BUG();
426
427	mpol_ops[pol->mode].rebind(pol, newmask, step);
428}
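
/*
 * Worked example (illustrative) for a policy that did not store a user
 * nodemask: an MPOL_INTERLEAVE policy currently using nodes {0,1}, whose
 * cpuset's mems_allowed changes from {0,1} to {2,3}.  nodes_remap() turns
 * {0,1} into {2,3}, so mpol_rebind_nodemask() does:
 *
 *	MPOL_REBIND_STEP1: pol->v.nodes = {0,1} | {2,3} = {0,1,2,3}
 *	MPOL_REBIND_STEP2: pol->v.nodes = {2,3}
 *
 * Between the two steps a lockless reader may still see the old nodes,
 * but never an empty nodemask, which is the point of the two-step rebind.
 */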
429
430/*
431 * Wrapper for mpol_rebind_policy() that just requires task
432 * pointer, and updates task mempolicy.
433 *
434 * Called with task's alloc_lock held.
435 */
436
437void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
438			enum mpol_rebind_step step)
439{
440	mpol_rebind_policy(tsk->mempolicy, new, step);
441}
442
443/*
444 * Rebind each vma in mm to new nodemask.
445 *
446 * Call holding a reference to mm.  Takes mm->mmap_sem during call.
447 */
448
449void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
450{
451	struct vm_area_struct *vma;
452
453	down_write(&mm->mmap_sem);
454	for (vma = mm->mmap; vma; vma = vma->vm_next)
455		mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
456	up_write(&mm->mmap_sem);
457}
458
459static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
460	[MPOL_DEFAULT] = {
461		.rebind = mpol_rebind_default,
462	},
463	[MPOL_INTERLEAVE] = {
464		.create = mpol_new_interleave,
465		.rebind = mpol_rebind_nodemask,
466	},
467	[MPOL_PREFERRED] = {
468		.create = mpol_new_preferred,
469		.rebind = mpol_rebind_preferred,
470	},
471	[MPOL_BIND] = {
472		.create = mpol_new_bind,
473		.rebind = mpol_rebind_nodemask,
474	},
475};
476
477static void migrate_page_add(struct page *page, struct list_head *pagelist,
478				unsigned long flags);
479
480/*
 * Scan through the pages, checking whether they satisfy the given
 * conditions, and move them to the pagelist if they do.
483 */
484static int queue_pages_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
485		unsigned long addr, unsigned long end,
486		const nodemask_t *nodes, unsigned long flags,
487		void *private)
488{
489	pte_t *orig_pte;
490	pte_t *pte;
491	spinlock_t *ptl;
492
493	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
494	do {
495		struct page *page;
496		int nid;
497
498		if (!pte_present(*pte))
499			continue;
500		page = vm_normal_page(vma, addr, *pte);
501		if (!page)
502			continue;
503		/*
504		 * vm_normal_page() filters out zero pages, but there might
505		 * still be PageReserved pages to skip, perhaps in a VDSO.
506		 */
507		if (PageReserved(page))
508			continue;
509		nid = page_to_nid(page);
510		if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
511			continue;
512
513		if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
514			migrate_page_add(page, private, flags);
515		else
516			break;
517	} while (pte++, addr += PAGE_SIZE, addr != end);
518	pte_unmap_unlock(orig_pte, ptl);
519	return addr != end;
520}
521
522static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma,
523		pmd_t *pmd, const nodemask_t *nodes, unsigned long flags,
524				    void *private)
525{
526#ifdef CONFIG_HUGETLB_PAGE
527	int nid;
528	struct page *page;
529	spinlock_t *ptl;
530	pte_t entry;
531
532	ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, (pte_t *)pmd);
533	entry = huge_ptep_get((pte_t *)pmd);
534	if (!pte_present(entry))
535		goto unlock;
536	page = pte_page(entry);
537	nid = page_to_nid(page);
538	if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
539		goto unlock;
540	/* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
541	if (flags & (MPOL_MF_MOVE_ALL) ||
542	    (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
543		isolate_huge_page(page, private);
544unlock:
545	spin_unlock(ptl);
546#else
547	BUG();
548#endif
549}
550
551static inline int queue_pages_pmd_range(struct vm_area_struct *vma, pud_t *pud,
552		unsigned long addr, unsigned long end,
553		const nodemask_t *nodes, unsigned long flags,
554		void *private)
555{
556	pmd_t *pmd;
557	unsigned long next;
558
559	pmd = pmd_offset(pud, addr);
560	do {
561		next = pmd_addr_end(addr, end);
562		if (!pmd_present(*pmd))
563			continue;
564		if (pmd_huge(*pmd) && is_vm_hugetlb_page(vma)) {
565			queue_pages_hugetlb_pmd_range(vma, pmd, nodes,
566						flags, private);
567			continue;
568		}
569		split_huge_page_pmd(vma, addr, pmd);
570		if (pmd_none_or_trans_huge_or_clear_bad(pmd))
571			continue;
572		if (queue_pages_pte_range(vma, pmd, addr, next, nodes,
573				    flags, private))
574			return -EIO;
575	} while (pmd++, addr = next, addr != end);
576	return 0;
577}
578
579static inline int queue_pages_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
580		unsigned long addr, unsigned long end,
581		const nodemask_t *nodes, unsigned long flags,
582		void *private)
583{
584	pud_t *pud;
585	unsigned long next;
586
587	pud = pud_offset(pgd, addr);
588	do {
589		next = pud_addr_end(addr, end);
590		if (pud_huge(*pud) && is_vm_hugetlb_page(vma))
591			continue;
592		if (pud_none_or_clear_bad(pud))
593			continue;
594		if (queue_pages_pmd_range(vma, pud, addr, next, nodes,
595				    flags, private))
596			return -EIO;
597	} while (pud++, addr = next, addr != end);
598	return 0;
599}
600
601static inline int queue_pages_pgd_range(struct vm_area_struct *vma,
602		unsigned long addr, unsigned long end,
603		const nodemask_t *nodes, unsigned long flags,
604		void *private)
605{
606	pgd_t *pgd;
607	unsigned long next;
608
609	pgd = pgd_offset(vma->vm_mm, addr);
610	do {
611		next = pgd_addr_end(addr, end);
612		if (pgd_none_or_clear_bad(pgd))
613			continue;
614		if (queue_pages_pud_range(vma, pgd, addr, next, nodes,
615				    flags, private))
616			return -EIO;
617	} while (pgd++, addr = next, addr != end);
618	return 0;
619}
620
621#ifdef CONFIG_NUMA_BALANCING
622/*
623 * This is used to mark a range of virtual addresses to be inaccessible.
624 * These are later cleared by a NUMA hinting fault. Depending on these
625 * faults, pages may be migrated for better NUMA placement.
626 *
627 * This is assuming that NUMA faults are handled using PROT_NONE. If
628 * an architecture makes a different choice, it will need further
629 * changes to the core.
630 */
631unsigned long change_prot_numa(struct vm_area_struct *vma,
632			unsigned long addr, unsigned long end)
633{
634	int nr_updated;
635
636	nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1);
637	if (nr_updated)
638		count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
639
640	return nr_updated;
641}
642#else
643static unsigned long change_prot_numa(struct vm_area_struct *vma,
644			unsigned long addr, unsigned long end)
645{
646	return 0;
647}
648#endif /* CONFIG_NUMA_BALANCING */
649
650/*
651 * Walk through page tables and collect pages to be migrated.
652 *
 * If pages found in a given range are on the set of nodes (determined by
 * @nodes and @flags), they are isolated and queued onto the pagelist
 * passed in via @private.
656 */
657static int
658queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
659		const nodemask_t *nodes, unsigned long flags, void *private)
660{
661	int err = 0;
662	struct vm_area_struct *vma, *prev;
663
664	vma = find_vma(mm, start);
665	if (!vma)
666		return -EFAULT;
667	prev = NULL;
668	for (; vma && vma->vm_start < end; vma = vma->vm_next) {
669		unsigned long endvma = vma->vm_end;
670
671		if (endvma > end)
672			endvma = end;
673		if (vma->vm_start > start)
674			start = vma->vm_start;
675
676		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
677			if (!vma->vm_next && vma->vm_end < end)
678				return -EFAULT;
679			if (prev && prev->vm_end < vma->vm_start)
680				return -EFAULT;
681		}
682
683		if (flags & MPOL_MF_LAZY) {
684			/* Similar to task_numa_work, skip inaccessible VMAs */
685			if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))
686				change_prot_numa(vma, start, endvma);
687			goto next;
688		}
689
690		if ((flags & MPOL_MF_STRICT) ||
691		     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
692		      vma_migratable(vma))) {
693
694			err = queue_pages_pgd_range(vma, start, endvma, nodes,
695						flags, private);
696			if (err)
697				break;
698		}
699next:
700		prev = vma;
701	}
702	return err;
703}
704
705/*
 * Apply policy to a single VMA.
707 * This must be called with the mmap_sem held for writing.
708 */
709static int vma_replace_policy(struct vm_area_struct *vma,
710						struct mempolicy *pol)
711{
712	int err;
713	struct mempolicy *old;
714	struct mempolicy *new;
715
716	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
717		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
718		 vma->vm_ops, vma->vm_file,
719		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
720
721	new = mpol_dup(pol);
722	if (IS_ERR(new))
723		return PTR_ERR(new);
724
725	if (vma->vm_ops && vma->vm_ops->set_policy) {
726		err = vma->vm_ops->set_policy(vma, new);
727		if (err)
728			goto err_out;
729	}
730
731	old = vma->vm_policy;
732	vma->vm_policy = new; /* protected by mmap_sem */
733	mpol_put(old);
734
735	return 0;
736 err_out:
737	mpol_put(new);
738	return err;
739}
740
741/* Step 2: apply policy to a range and do splits. */
742static int mbind_range(struct mm_struct *mm, unsigned long start,
743		       unsigned long end, struct mempolicy *new_pol)
744{
745	struct vm_area_struct *next;
746	struct vm_area_struct *prev;
747	struct vm_area_struct *vma;
748	int err = 0;
749	pgoff_t pgoff;
750	unsigned long vmstart;
751	unsigned long vmend;
752
753	vma = find_vma(mm, start);
754	if (!vma || vma->vm_start > start)
755		return -EFAULT;
756
757	prev = vma->vm_prev;
758	if (start > vma->vm_start)
759		prev = vma;
760
761	for (; vma && vma->vm_start < end; prev = vma, vma = next) {
762		next = vma->vm_next;
763		vmstart = max(start, vma->vm_start);
764		vmend   = min(end, vma->vm_end);
765
766		if (mpol_equal(vma_policy(vma), new_pol))
767			continue;
768
769		pgoff = vma->vm_pgoff +
770			((vmstart - vma->vm_start) >> PAGE_SHIFT);
771		prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
772				  vma->anon_vma, vma->vm_file, pgoff,
773				  new_pol, vma_get_anon_name(vma));
774		if (prev) {
775			vma = prev;
776			next = vma->vm_next;
777			if (mpol_equal(vma_policy(vma), new_pol))
778				continue;
779			/* vma_merge() joined vma && vma->next, case 8 */
780			goto replace;
781		}
782		if (vma->vm_start != vmstart) {
783			err = split_vma(vma->vm_mm, vma, vmstart, 1);
784			if (err)
785				goto out;
786		}
787		if (vma->vm_end != vmend) {
788			err = split_vma(vma->vm_mm, vma, vmend, 0);
789			if (err)
790				goto out;
791		}
792 replace:
793		err = vma_replace_policy(vma, new_pol);
794		if (err)
795			goto out;
796	}
797
798 out:
799	return err;
800}
801
802/* Set the process memory policy */
803static long do_set_mempolicy(unsigned short mode, unsigned short flags,
804			     nodemask_t *nodes)
805{
806	struct mempolicy *new, *old;
807	NODEMASK_SCRATCH(scratch);
808	int ret;
809
810	if (!scratch)
811		return -ENOMEM;
812
813	new = mpol_new(mode, flags, nodes);
814	if (IS_ERR(new)) {
815		ret = PTR_ERR(new);
816		goto out;
817	}
818
819	task_lock(current);
820	ret = mpol_set_nodemask(new, nodes, scratch);
821	if (ret) {
822		task_unlock(current);
823		mpol_put(new);
824		goto out;
825	}
826	old = current->mempolicy;
827	current->mempolicy = new;
828	if (new && new->mode == MPOL_INTERLEAVE &&
829	    nodes_weight(new->v.nodes))
830		current->il_next = first_node(new->v.nodes);
831	task_unlock(current);
832	mpol_put(old);
833	ret = 0;
834out:
835	NODEMASK_SCRATCH_FREE(scratch);
836	return ret;
837}
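
/*
 * Illustrative userspace sketch (assumes libnuma's <numaif.h>): the mode
 * flags are OR-ed into the mode argument of set_mempolicy(2).  With
 * MPOL_F_RELATIVE_NODES the mask below means "the first two nodes this
 * task is allowed to use"; with MPOL_F_STATIC_NODES it means "physical
 * nodes 0 and 1, and do not remap them when the cpuset changes":
 *
 *	unsigned long nodes = 0x3;
 *	unsigned long maxnode = sizeof(nodes) * 8 + 1;
 *
 *	set_mempolicy(MPOL_INTERLEAVE | MPOL_F_RELATIVE_NODES, &nodes, maxnode);
 *	set_mempolicy(MPOL_INTERLEAVE | MPOL_F_STATIC_NODES, &nodes, maxnode);
 */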
838
839/*
840 * Return nodemask for policy for get_mempolicy() query
841 *
842 * Called with task's alloc_lock held
843 */
844static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
845{
846	nodes_clear(*nodes);
847	if (p == &default_policy)
848		return;
849
850	switch (p->mode) {
851	case MPOL_BIND:
852		/* Fall through */
853	case MPOL_INTERLEAVE:
854		*nodes = p->v.nodes;
855		break;
856	case MPOL_PREFERRED:
857		if (!(p->flags & MPOL_F_LOCAL))
858			node_set(p->v.preferred_node, *nodes);
859		/* else return empty node mask for local allocation */
860		break;
861	default:
862		BUG();
863	}
864}
865
866static int lookup_node(struct mm_struct *mm, unsigned long addr)
867{
868	struct page *p;
869	int err;
870
871	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
872	if (err >= 0) {
873		err = page_to_nid(p);
874		put_page(p);
875	}
876	return err;
877}
878
879/* Retrieve NUMA policy */
880static long do_get_mempolicy(int *policy, nodemask_t *nmask,
881			     unsigned long addr, unsigned long flags)
882{
883	int err;
884	struct mm_struct *mm = current->mm;
885	struct vm_area_struct *vma = NULL;
886	struct mempolicy *pol = current->mempolicy;
887
888	if (flags &
889		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
890		return -EINVAL;
891
892	if (flags & MPOL_F_MEMS_ALLOWED) {
893		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
894			return -EINVAL;
895		*policy = 0;	/* just so it's initialized */
896		task_lock(current);
897		*nmask  = cpuset_current_mems_allowed;
898		task_unlock(current);
899		return 0;
900	}
901
902	if (flags & MPOL_F_ADDR) {
903		/*
904		 * Do NOT fall back to task policy if the
905		 * vma/shared policy at addr is NULL.  We
906		 * want to return MPOL_DEFAULT in this case.
907		 */
908		down_read(&mm->mmap_sem);
909		vma = find_vma_intersection(mm, addr, addr+1);
910		if (!vma) {
911			up_read(&mm->mmap_sem);
912			return -EFAULT;
913		}
914		if (vma->vm_ops && vma->vm_ops->get_policy)
915			pol = vma->vm_ops->get_policy(vma, addr);
916		else
917			pol = vma->vm_policy;
918	} else if (addr)
919		return -EINVAL;
920
921	if (!pol)
922		pol = &default_policy;	/* indicates default behavior */
923
924	if (flags & MPOL_F_NODE) {
925		if (flags & MPOL_F_ADDR) {
926			err = lookup_node(mm, addr);
927			if (err < 0)
928				goto out;
929			*policy = err;
930		} else if (pol == current->mempolicy &&
931				pol->mode == MPOL_INTERLEAVE) {
932			*policy = current->il_next;
933		} else {
934			err = -EINVAL;
935			goto out;
936		}
937	} else {
938		*policy = pol == &default_policy ? MPOL_DEFAULT :
939						pol->mode;
940		/*
941		 * Internal mempolicy flags must be masked off before exposing
942		 * the policy to userspace.
943		 */
944		*policy |= (pol->flags & MPOL_MODE_FLAGS);
945	}
946
947	if (vma) {
948		up_read(&current->mm->mmap_sem);
949		vma = NULL;
950	}
951
952	err = 0;
953	if (nmask) {
954		if (mpol_store_user_nodemask(pol)) {
955			*nmask = pol->w.user_nodemask;
956		} else {
957			task_lock(current);
958			get_policy_nodemask(pol, nmask);
959			task_unlock(current);
960		}
961	}
962
963 out:
964	mpol_cond_put(pol);
965	if (vma)
966		up_read(&current->mm->mmap_sem);
967	return err;
968}
969
970#ifdef CONFIG_MIGRATION
971/*
972 * page migration
973 */
974static void migrate_page_add(struct page *page, struct list_head *pagelist,
975				unsigned long flags)
976{
977	/*
978	 * Avoid migrating a page that is shared with others.
979	 */
980	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
981		if (!isolate_lru_page(page)) {
982			list_add_tail(&page->lru, pagelist);
983			inc_zone_page_state(page, NR_ISOLATED_ANON +
984					    page_is_file_cache(page));
985		}
986	}
987}
988
989static struct page *new_node_page(struct page *page, unsigned long node, int **x)
990{
991	if (PageHuge(page))
992		return alloc_huge_page_node(page_hstate(compound_head(page)),
993					node);
994	else
995		return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
996}
997
998/*
999 * Migrate pages from one node to a target node.
1000 * Returns error or the number of pages not migrated.
1001 */
1002static int migrate_to_node(struct mm_struct *mm, int source, int dest,
1003			   int flags)
1004{
1005	nodemask_t nmask;
1006	LIST_HEAD(pagelist);
1007	int err = 0;
1008
1009	nodes_clear(nmask);
1010	node_set(source, nmask);
1011
1012	/*
1013	 * This does not "check" the range but isolates all pages that
1014	 * need migration.  Between passing in the full user address
	 * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail.
1016	 */
1017	VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1018	queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
1019			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1020
1021	if (!list_empty(&pagelist)) {
1022		err = migrate_pages(&pagelist, new_node_page, NULL, dest,
1023					MIGRATE_SYNC, MR_SYSCALL);
1024		if (err)
1025			putback_movable_pages(&pagelist);
1026	}
1027
1028	return err;
1029}
1030
1031/*
1032 * Move pages between the two nodesets so as to preserve the physical
1033 * layout as much as possible.
1034 *
 * Returns the number of pages that could not be moved.
1036 */
1037int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1038		     const nodemask_t *to, int flags)
1039{
1040	int busy = 0;
1041	int err;
1042	nodemask_t tmp;
1043
1044	err = migrate_prep();
1045	if (err)
1046		return err;
1047
1048	down_read(&mm->mmap_sem);
1049
1050	err = migrate_vmas(mm, from, to, flags);
1051	if (err)
1052		goto out;
1053
1054	/*
1055	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1056	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
1057	 * bit in 'tmp', and return that <source, dest> pair for migration.
1058	 * The pair of nodemasks 'to' and 'from' define the map.
1059	 *
	 * If no pair of bits is found that way, fall back to picking some
	 * pair of 'source' and 'dest' bits that are not the same.  If the
	 * 'source' and 'dest' bits are the same, this represents a node
	 * that will be migrating to itself, so no pages need to move.
1064	 *
1065	 * If no bits are left in 'tmp', or if all remaining bits left
1066	 * in 'tmp' correspond to the same bit in 'to', return false
1067	 * (nothing left to migrate).
1068	 *
1069	 * This lets us pick a pair of nodes to migrate between, such that
1070	 * if possible the dest node is not already occupied by some other
1071	 * source node, minimizing the risk of overloading the memory on a
	 * node that would happen if we migrated incoming memory to a node
	 * before migrating the outgoing memory from that same node.
1074	 *
1075	 * A single scan of tmp is sufficient.  As we go, we remember the
1076	 * most recent <s, d> pair that moved (s != d).  If we find a pair
1077	 * that not only moved, but what's better, moved to an empty slot
1078	 * (d is not set in tmp), then we break out then, with that pair.
	 * Otherwise, when we finish scanning tmp, we at least have the
1080	 * most recent <s, d> pair that moved.  If we get all the way through
1081	 * the scan of tmp without finding any node that moved, much less
1082	 * moved to an empty node, then there is nothing left worth migrating.
1083	 */
1084
1085	tmp = *from;
1086	while (!nodes_empty(tmp)) {
1087		int s,d;
1088		int source = NUMA_NO_NODE;
1089		int dest = 0;
1090
1091		for_each_node_mask(s, tmp) {
1092
1093			/*
1094			 * do_migrate_pages() tries to maintain the relative
1095			 * node relationship of the pages established between
1096			 * threads and memory areas.
			 *
			 * However, if the number of source nodes is not equal to
			 * the number of destination nodes, we cannot preserve
			 * this node-relative relationship.  In that case, skip
			 * copying memory from a node that is in the destination
			 * mask.
			 *
			 * Example: [2,3,4] -> [3,4,5] moves everything.
			 *          [0-7]   -> [3,4,5] moves only 0,1,2,6,7.
1106			 */
1107
1108			if ((nodes_weight(*from) != nodes_weight(*to)) &&
1109						(node_isset(s, *to)))
1110				continue;
1111
1112			d = node_remap(s, *from, *to);
1113			if (s == d)
1114				continue;
1115
1116			source = s;	/* Node moved. Memorize */
1117			dest = d;
1118
1119			/* dest not in remaining from nodes? */
1120			if (!node_isset(dest, tmp))
1121				break;
1122		}
1123		if (source == NUMA_NO_NODE)
1124			break;
1125
1126		node_clear(source, tmp);
1127		err = migrate_to_node(mm, source, dest, flags);
1128		if (err > 0)
1129			busy += err;
1130		if (err < 0)
1131			break;
1132	}
1133out:
1134	up_read(&mm->mmap_sem);
1135	if (err < 0)
1136		return err;
1137	return busy;
1138
1139}
1140
1141/*
1142 * Allocate a new page for page migration based on vma policy.
1143 * Start by assuming the page is mapped by the same vma as contains @start.
 * If not, search forward from there.  N.B., this assumes that the
1145 * list of pages handed to migrate_pages()--which is how we get here--
1146 * is in virtual address order.
1147 */
1148static struct page *new_page(struct page *page, unsigned long start, int **x)
1149{
1150	struct vm_area_struct *vma;
1151	unsigned long uninitialized_var(address);
1152
1153	vma = find_vma(current->mm, start);
1154	while (vma) {
1155		address = page_address_in_vma(page, vma);
1156		if (address != -EFAULT)
1157			break;
1158		vma = vma->vm_next;
1159	}
1160
1161	if (PageHuge(page)) {
1162		BUG_ON(!vma);
1163		return alloc_huge_page_noerr(vma, address, 1);
1164	}
1165	/*
1166	 * if !vma, alloc_page_vma() will use task or system default policy
1167	 */
1168	return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1169}
1170#else
1171
1172static void migrate_page_add(struct page *page, struct list_head *pagelist,
1173				unsigned long flags)
1174{
1175}
1176
1177int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1178		     const nodemask_t *to, int flags)
1179{
1180	return -ENOSYS;
1181}
1182
1183static struct page *new_page(struct page *page, unsigned long start, int **x)
1184{
1185	return NULL;
1186}
1187#endif
1188
1189static long do_mbind(unsigned long start, unsigned long len,
1190		     unsigned short mode, unsigned short mode_flags,
1191		     nodemask_t *nmask, unsigned long flags)
1192{
1193	struct mm_struct *mm = current->mm;
1194	struct mempolicy *new;
1195	unsigned long end;
1196	int err;
1197	LIST_HEAD(pagelist);
1198
1199	if (flags & ~(unsigned long)MPOL_MF_VALID)
1200		return -EINVAL;
1201	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1202		return -EPERM;
1203
1204	if (start & ~PAGE_MASK)
1205		return -EINVAL;
1206
1207	if (mode == MPOL_DEFAULT)
1208		flags &= ~MPOL_MF_STRICT;
1209
1210	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1211	end = start + len;
1212
1213	if (end < start)
1214		return -EINVAL;
1215	if (end == start)
1216		return 0;
1217
1218	new = mpol_new(mode, mode_flags, nmask);
1219	if (IS_ERR(new))
1220		return PTR_ERR(new);
1221
1222	if (flags & MPOL_MF_LAZY)
1223		new->flags |= MPOL_F_MOF;
1224
1225	/*
1226	 * If we are using the default policy then operation
1227	 * on discontinuous address spaces is okay after all
1228	 */
1229	if (!new)
1230		flags |= MPOL_MF_DISCONTIG_OK;
1231
1232	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1233		 start, start + len, mode, mode_flags,
1234		 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
1235
1236	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1237
1238		err = migrate_prep();
1239		if (err)
1240			goto mpol_out;
1241	}
1242	{
1243		NODEMASK_SCRATCH(scratch);
1244		if (scratch) {
1245			down_write(&mm->mmap_sem);
1246			task_lock(current);
1247			err = mpol_set_nodemask(new, nmask, scratch);
1248			task_unlock(current);
1249			if (err)
1250				up_write(&mm->mmap_sem);
1251		} else
1252			err = -ENOMEM;
1253		NODEMASK_SCRATCH_FREE(scratch);
1254	}
1255	if (err)
1256		goto mpol_out;
1257
1258	err = queue_pages_range(mm, start, end, nmask,
1259			  flags | MPOL_MF_INVERT, &pagelist);
1260	if (!err)
1261		err = mbind_range(mm, start, end, new);
1262
1263	if (!err) {
1264		int nr_failed = 0;
1265
1266		if (!list_empty(&pagelist)) {
1267			WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1268			nr_failed = migrate_pages(&pagelist, new_page, NULL,
1269				start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1270			if (nr_failed)
1271				putback_movable_pages(&pagelist);
1272		}
1273
1274		if (nr_failed && (flags & MPOL_MF_STRICT))
1275			err = -EIO;
1276	} else
1277		putback_movable_pages(&pagelist);
1278
1279	up_write(&mm->mmap_sem);
1280 mpol_out:
1281	mpol_put(new);
1282	return err;
1283}
1284
1285/*
1286 * User space interface with variable sized bitmaps for nodelists.
1287 */
1288
1289/* Copy a node mask from user space. */
1290static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1291		     unsigned long maxnode)
1292{
1293	unsigned long k;
1294	unsigned long nlongs;
1295	unsigned long endmask;
1296
1297	--maxnode;
1298	nodes_clear(*nodes);
1299	if (maxnode == 0 || !nmask)
1300		return 0;
1301	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1302		return -EINVAL;
1303
1304	nlongs = BITS_TO_LONGS(maxnode);
1305	if ((maxnode % BITS_PER_LONG) == 0)
1306		endmask = ~0UL;
1307	else
1308		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1309
	/* When the user specifies more nodes than supported, just check
	   that the unsupported part is all zero. */
1312	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1313		if (nlongs > PAGE_SIZE/sizeof(long))
1314			return -EINVAL;
1315		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1316			unsigned long t;
1317			if (get_user(t, nmask + k))
1318				return -EFAULT;
1319			if (k == nlongs - 1) {
1320				if (t & endmask)
1321					return -EINVAL;
1322			} else if (t)
1323				return -EINVAL;
1324		}
1325		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1326		endmask = ~0UL;
1327	}
1328
1329	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1330		return -EFAULT;
1331	nodes_addr(*nodes)[nlongs-1] &= endmask;
1332	return 0;
1333}
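
/*
 * Illustrative sketch of the userspace side of the bitmap ABI that
 * get_nodes() parses (names and sizes are examples only): the mask is an
 * array of unsigned longs and maxnode counts bits.  Because the kernel
 * uses maxnode - 1 bits, pass one more than the width of the mask:
 *
 *	unsigned long mask = (1UL << 0) | (1UL << 2);	// nodes 0 and 2
 *
 *	mbind(addr, len, MPOL_INTERLEAVE, &mask, sizeof(mask) * 8 + 1, 0);
 */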
1334
1335/* Copy a kernel node mask to user space */
1336static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1337			      nodemask_t *nodes)
1338{
1339	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1340	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1341
1342	if (copy > nbytes) {
1343		if (copy > PAGE_SIZE)
1344			return -EINVAL;
1345		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1346			return -EFAULT;
1347		copy = nbytes;
1348	}
1349	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1350}
1351
1352SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1353		unsigned long, mode, const unsigned long __user *, nmask,
1354		unsigned long, maxnode, unsigned, flags)
1355{
1356	nodemask_t nodes;
1357	int err;
1358	unsigned short mode_flags;
1359
1360	mode_flags = mode & MPOL_MODE_FLAGS;
1361	mode &= ~MPOL_MODE_FLAGS;
1362	if (mode >= MPOL_MAX)
1363		return -EINVAL;
1364	if ((mode_flags & MPOL_F_STATIC_NODES) &&
1365	    (mode_flags & MPOL_F_RELATIVE_NODES))
1366		return -EINVAL;
1367	err = get_nodes(&nodes, nmask, maxnode);
1368	if (err)
1369		return err;
1370	return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1371}
1372
1373/* Set the process memory policy */
1374SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1375		unsigned long, maxnode)
1376{
1377	int err;
1378	nodemask_t nodes;
1379	unsigned short flags;
1380
1381	flags = mode & MPOL_MODE_FLAGS;
1382	mode &= ~MPOL_MODE_FLAGS;
1383	if ((unsigned int)mode >= MPOL_MAX)
1384		return -EINVAL;
1385	if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1386		return -EINVAL;
1387	err = get_nodes(&nodes, nmask, maxnode);
1388	if (err)
1389		return err;
1390	return do_set_mempolicy(mode, flags, &nodes);
1391}
1392
1393SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1394		const unsigned long __user *, old_nodes,
1395		const unsigned long __user *, new_nodes)
1396{
1397	const struct cred *cred = current_cred(), *tcred;
1398	struct mm_struct *mm = NULL;
1399	struct task_struct *task;
1400	nodemask_t task_nodes;
1401	int err;
1402	nodemask_t *old;
1403	nodemask_t *new;
1404	NODEMASK_SCRATCH(scratch);
1405
1406	if (!scratch)
1407		return -ENOMEM;
1408
1409	old = &scratch->mask1;
1410	new = &scratch->mask2;
1411
1412	err = get_nodes(old, old_nodes, maxnode);
1413	if (err)
1414		goto out;
1415
1416	err = get_nodes(new, new_nodes, maxnode);
1417	if (err)
1418		goto out;
1419
1420	/* Find the mm_struct */
1421	rcu_read_lock();
1422	task = pid ? find_task_by_vpid(pid) : current;
1423	if (!task) {
1424		rcu_read_unlock();
1425		err = -ESRCH;
1426		goto out;
1427	}
1428	get_task_struct(task);
1429
1430	err = -EINVAL;
1431
1432	/*
1433	 * Check if this process has the right to modify the specified
1434	 * process. The right exists if the process has administrative
1435	 * capabilities, superuser privileges or the same
1436	 * userid as the target process.
1437	 */
1438	tcred = __task_cred(task);
1439	if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
1440	    !uid_eq(cred->uid,  tcred->suid) && !uid_eq(cred->uid,  tcred->uid) &&
1441	    !capable(CAP_SYS_NICE)) {
1442		rcu_read_unlock();
1443		err = -EPERM;
1444		goto out_put;
1445	}
1446	rcu_read_unlock();
1447
1448	task_nodes = cpuset_mems_allowed(task);
1449	/* Is the user allowed to access the target nodes? */
1450	if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1451		err = -EPERM;
1452		goto out_put;
1453	}
1454
1455	if (!nodes_subset(*new, node_states[N_MEMORY])) {
1456		err = -EINVAL;
1457		goto out_put;
1458	}
1459
1460	err = security_task_movememory(task);
1461	if (err)
1462		goto out_put;
1463
1464	mm = get_task_mm(task);
1465	put_task_struct(task);
1466
1467	if (!mm) {
1468		err = -EINVAL;
1469		goto out;
1470	}
1471
1472	err = do_migrate_pages(mm, old, new,
1473		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1474
1475	mmput(mm);
1476out:
1477	NODEMASK_SCRATCH_FREE(scratch);
1478
1479	return err;
1480
1481out_put:
1482	put_task_struct(task);
1483	goto out;
1484
1485}
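
/*
 * Illustrative userspace sketch (assumes libnuma's <numaif.h>, which
 * declares a migrate_pages() wrapper): move what @pid has on node 0 over
 * to node 2, subject to the permission checks above.
 *
 *	unsigned long old_nodes = 1UL << 0;
 *	unsigned long new_nodes = 1UL << 2;
 *	long ret;
 *
 *	ret = migrate_pages(pid, sizeof(old_nodes) * 8 + 1,
 *			    &old_nodes, &new_nodes);
 *	if (ret < 0)
 *		perror("migrate_pages");
 *	else if (ret > 0)
 *		fprintf(stderr, "%ld pages were not moved\n", ret);
 */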
1486
1487
1488/* Retrieve NUMA policy */
1489SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1490		unsigned long __user *, nmask, unsigned long, maxnode,
1491		unsigned long, addr, unsigned long, flags)
1492{
1493	int err;
1494	int uninitialized_var(pval);
1495	nodemask_t nodes;
1496
1497	if (nmask != NULL && maxnode < MAX_NUMNODES)
1498		return -EINVAL;
1499
1500	err = do_get_mempolicy(&pval, &nodes, addr, flags);
1501
1502	if (err)
1503		return err;
1504
1505	if (policy && put_user(pval, policy))
1506		return -EFAULT;
1507
1508	if (nmask)
1509		err = copy_nodes_to_user(nmask, maxnode, &nodes);
1510
1511	return err;
1512}
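
/*
 * Illustrative userspace sketch (assumes libnuma's <numaif.h>): query the
 * policy that governs a mapped address, and separately ask which node the
 * page at that address actually sits on.  Note that when a nodemask is
 * requested, the check above demands room for at least MAX_NUMNODES bits,
 * so the buffer is sized generously here (enough for 1024 nodes):
 *
 *	int mode, nid;
 *	unsigned long nodes[16] = { 0 };
 *
 *	get_mempolicy(&mode, nodes, sizeof(nodes) * 8, addr, MPOL_F_ADDR);
 *	get_mempolicy(&nid, NULL, 0, addr, MPOL_F_NODE | MPOL_F_ADDR);
 */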
1513
1514#ifdef CONFIG_COMPAT
1515
1516COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1517		       compat_ulong_t __user *, nmask,
1518		       compat_ulong_t, maxnode,
1519		       compat_ulong_t, addr, compat_ulong_t, flags)
1520{
1521	long err;
1522	unsigned long __user *nm = NULL;
1523	unsigned long nr_bits, alloc_size;
1524	DECLARE_BITMAP(bm, MAX_NUMNODES);
1525
1526	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1527	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1528
1529	if (nmask)
1530		nm = compat_alloc_user_space(alloc_size);
1531
1532	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1533
1534	if (!err && nmask) {
1535		unsigned long copy_size;
1536		copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1537		err = copy_from_user(bm, nm, copy_size);
1538		/* ensure entire bitmap is zeroed */
1539		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1540		err |= compat_put_bitmap(nmask, bm, nr_bits);
1541	}
1542
1543	return err;
1544}
1545
1546COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
1547		       compat_ulong_t, maxnode)
1548{
1549	long err = 0;
1550	unsigned long __user *nm = NULL;
1551	unsigned long nr_bits, alloc_size;
1552	DECLARE_BITMAP(bm, MAX_NUMNODES);
1553
1554	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1555	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1556
1557	if (nmask) {
1558		err = compat_get_bitmap(bm, nmask, nr_bits);
1559		nm = compat_alloc_user_space(alloc_size);
1560		err |= copy_to_user(nm, bm, alloc_size);
1561	}
1562
1563	if (err)
1564		return -EFAULT;
1565
1566	return sys_set_mempolicy(mode, nm, nr_bits+1);
1567}
1568
1569COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
1570		       compat_ulong_t, mode, compat_ulong_t __user *, nmask,
1571		       compat_ulong_t, maxnode, compat_ulong_t, flags)
1572{
1573	long err = 0;
1574	unsigned long __user *nm = NULL;
1575	unsigned long nr_bits, alloc_size;
1576	nodemask_t bm;
1577
1578	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1579	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1580
1581	if (nmask) {
1582		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1583		nm = compat_alloc_user_space(alloc_size);
1584		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1585	}
1586
1587	if (err)
1588		return -EFAULT;
1589
1590	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1591}
1592
1593#endif
1594
1595struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
1596						unsigned long addr)
1597{
1598	struct mempolicy *pol = NULL;
1599
1600	if (vma) {
1601		if (vma->vm_ops && vma->vm_ops->get_policy) {
1602			pol = vma->vm_ops->get_policy(vma, addr);
1603		} else if (vma->vm_policy) {
1604			pol = vma->vm_policy;
1605
1606			/*
1607			 * shmem_alloc_page() passes MPOL_F_SHARED policy with
1608			 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1609			 * count on these policies which will be dropped by
1610			 * mpol_cond_put() later
1611			 */
1612			if (mpol_needs_cond_ref(pol))
1613				mpol_get(pol);
1614		}
1615	}
1616
1617	return pol;
1618}
1619
1620/*
1621 * get_vma_policy(@vma, @addr)
1622 * @vma: virtual memory area whose policy is sought
1623 * @addr: address in @vma for shared policy lookup
1624 *
1625 * Returns effective policy for a VMA at specified address.
1626 * Falls back to current->mempolicy or system default policy, as necessary.
1627 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1628 * count--added by the get_policy() vm_op, as appropriate--to protect against
1629 * freeing by another task.  It is the caller's responsibility to free the
1630 * extra reference for shared policies.
1631 */
1632static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
1633						unsigned long addr)
1634{
1635	struct mempolicy *pol = __get_vma_policy(vma, addr);
1636
1637	if (!pol)
1638		pol = get_task_policy(current);
1639
1640	return pol;
1641}
1642
1643bool vma_policy_mof(struct vm_area_struct *vma)
1644{
1645	struct mempolicy *pol;
1646
1647	if (vma->vm_ops && vma->vm_ops->get_policy) {
1648		bool ret = false;
1649
1650		pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1651		if (pol && (pol->flags & MPOL_F_MOF))
1652			ret = true;
1653		mpol_cond_put(pol);
1654
1655		return ret;
1656	}
1657
1658	pol = vma->vm_policy;
1659	if (!pol)
1660		pol = get_task_policy(current);
1661
1662	return pol->flags & MPOL_F_MOF;
1663}
1664
1665static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1666{
1667	enum zone_type dynamic_policy_zone = policy_zone;
1668
1669	BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1670
1671	/*
	 * If policy->v.nodes has movable memory only,
	 * we apply the policy only when gfp_zone(gfp) == ZONE_MOVABLE.
	 *
	 * policy->v.nodes has already been intersected with
	 * node_states[N_MEMORY], so if the following test fails, it implies
	 * policy->v.nodes contains movable memory only.
1678	 */
1679	if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
1680		dynamic_policy_zone = ZONE_MOVABLE;
1681
1682	return zone >= dynamic_policy_zone;
1683}
1684
1685/*
1686 * Return a nodemask representing a mempolicy for filtering nodes for
1687 * page allocation
1688 */
1689static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1690{
1691	/* Lower zones don't get a nodemask applied for MPOL_BIND */
1692	if (unlikely(policy->mode == MPOL_BIND) &&
1693			apply_policy_zone(policy, gfp_zone(gfp)) &&
1694			cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1695		return &policy->v.nodes;
1696
1697	return NULL;
1698}
1699
1700/* Return a zonelist indicated by gfp for node representing a mempolicy */
1701static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
1702	int nd)
1703{
1704	switch (policy->mode) {
1705	case MPOL_PREFERRED:
1706		if (!(policy->flags & MPOL_F_LOCAL))
1707			nd = policy->v.preferred_node;
1708		break;
1709	case MPOL_BIND:
1710		/*
1711		 * Normally, MPOL_BIND allocations are node-local within the
1712		 * allowed nodemask.  However, if __GFP_THISNODE is set and the
1713		 * current node isn't part of the mask, we use the zonelist for
1714		 * the first node in the mask instead.
1715		 */
1716		if (unlikely(gfp & __GFP_THISNODE) &&
1717				unlikely(!node_isset(nd, policy->v.nodes)))
1718			nd = first_node(policy->v.nodes);
1719		break;
1720	default:
1721		BUG();
1722	}
1723	return node_zonelist(nd, gfp);
1724}
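
/*
 * Worked example (illustrative): with MPOL_PREFERRED on node 3 this
 * returns node 3's zonelist and normal fallback does the rest.  With
 * MPOL_BIND over {2,3} the local node's zonelist is returned and the
 * nodemask from policy_nodemask() does the filtering.  If __GFP_THISNODE
 * is set while running on, say, node 0, the zonelist of node 2, the first
 * node of the mask, is used instead.
 */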
1725
1726/* Do dynamic interleaving for a process */
1727static unsigned interleave_nodes(struct mempolicy *policy)
1728{
1729	unsigned nid, next;
1730	struct task_struct *me = current;
1731
1732	nid = me->il_next;
1733	next = next_node(nid, policy->v.nodes);
1734	if (next >= MAX_NUMNODES)
1735		next = first_node(policy->v.nodes);
1736	if (next < MAX_NUMNODES)
1737		me->il_next = next;
1738	return nid;
1739}
1740
1741/*
1742 * Depending on the memory policy provide a node from which to allocate the
1743 * next slab entry.
1744 */
1745unsigned int mempolicy_slab_node(void)
1746{
1747	struct mempolicy *policy;
1748	int node = numa_mem_id();
1749
1750	if (in_interrupt())
1751		return node;
1752
1753	policy = current->mempolicy;
1754	if (!policy || policy->flags & MPOL_F_LOCAL)
1755		return node;
1756
1757	switch (policy->mode) {
1758	case MPOL_PREFERRED:
1759		/*
1760		 * handled MPOL_F_LOCAL above
1761		 */
1762		return policy->v.preferred_node;
1763
1764	case MPOL_INTERLEAVE:
1765		return interleave_nodes(policy);
1766
1767	case MPOL_BIND: {
1768		/*
1769		 * Follow bind policy behavior and start allocation at the
1770		 * first node.
1771		 */
1772		struct zonelist *zonelist;
1773		struct zone *zone;
1774		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1775		zonelist = &NODE_DATA(node)->node_zonelists[0];
1776		(void)first_zones_zonelist(zonelist, highest_zoneidx,
1777							&policy->v.nodes,
1778							&zone);
1779		return zone ? zone->node : node;
1780	}
1781
1782	default:
1783		BUG();
1784	}
1785}
1786
1787/* Do static interleaving for a VMA with known offset. */
1788static unsigned offset_il_node(struct mempolicy *pol,
1789		struct vm_area_struct *vma, unsigned long off)
1790{
1791	unsigned nnodes = nodes_weight(pol->v.nodes);
1792	unsigned target;
1793	int c;
1794	int nid = NUMA_NO_NODE;
1795
1796	if (!nnodes)
1797		return numa_node_id();
1798	target = (unsigned int)off % nnodes;
1799	c = 0;
1800	do {
1801		nid = next_node(nid, pol->v.nodes);
1802		c++;
1803	} while (c <= target);
1804	return nid;
1805}
1806
1807/* Determine a node number for interleave */
1808static inline unsigned interleave_nid(struct mempolicy *pol,
1809		 struct vm_area_struct *vma, unsigned long addr, int shift)
1810{
1811	if (vma) {
1812		unsigned long off;
1813
1814		/*
1815		 * for small pages, there is no difference between
1816		 * shift and PAGE_SHIFT, so the bit-shift is safe.
1817		 * for huge pages, since vm_pgoff is in units of small
1818		 * pages, we need to shift off the always 0 bits to get
1819		 * a useful offset.
1820		 */
1821		BUG_ON(shift < PAGE_SHIFT);
1822		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1823		off += (addr - vma->vm_start) >> shift;
1824		return offset_il_node(pol, vma, off);
1825	} else
1826		return interleave_nodes(pol);
1827}
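
/*
 * Worked example (illustrative): a file mapping interleaved over nodes
 * {0,2,5}, so nnodes = 3.  For the small page whose offset into the file
 * is 7 pages, offset_il_node() computes target = 7 % 3 = 1 and walks to
 * the second node of the mask, so the page goes to node 2.  The choice
 * depends only on the offset into the object, so every task mapping that
 * file page gets the same answer.  For huge pages, interleave_nid() first
 * shifts the offset right by (shift - PAGE_SHIFT) so that consecutive
 * huge pages, rather than consecutive small pages, rotate through the
 * nodes.
 */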
1828
1829/*
1830 * Return the bit number of a random bit set in the nodemask.
1831 * (returns NUMA_NO_NODE if nodemask is empty)
1832 */
1833int node_random(const nodemask_t *maskp)
1834{
1835	int w, bit = NUMA_NO_NODE;
1836
1837	w = nodes_weight(*maskp);
1838	if (w)
1839		bit = bitmap_ord_to_pos(maskp->bits,
1840			get_random_int() % w, MAX_NUMNODES);
1841	return bit;
1842}
1843
1844#ifdef CONFIG_HUGETLBFS
1845/*
1846 * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1847 * @vma: virtual memory area whose policy is sought
1848 * @addr: address in @vma for shared policy lookup and interleave policy
1849 * @gfp_flags: for requested zone
1850 * @mpol: pointer to mempolicy pointer for reference counted mempolicy
1851 * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
1852 *
1853 * Returns a zonelist suitable for a huge page allocation and a pointer
1854 * to the struct mempolicy for conditional unref after allocation.
 * If the effective policy is 'BIND', returns a pointer to the mempolicy's
1856 * @nodemask for filtering the zonelist.
1857 *
1858 * Must be protected by read_mems_allowed_begin()
1859 */
1860struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1861				gfp_t gfp_flags, struct mempolicy **mpol,
1862				nodemask_t **nodemask)
1863{
1864	struct zonelist *zl;
1865
1866	*mpol = get_vma_policy(vma, addr);
1867	*nodemask = NULL;	/* assume !MPOL_BIND */
1868
1869	if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1870		zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1871				huge_page_shift(hstate_vma(vma))), gfp_flags);
1872	} else {
1873		zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
1874		if ((*mpol)->mode == MPOL_BIND)
1875			*nodemask = &(*mpol)->v.nodes;
1876	}
1877	return zl;
1878}
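
/*
 * Sketch of the typical caller pattern (illustrative; error handling and
 * hugetlb details omitted, and htlb_alloc_mask() stands in for whatever
 * GFP mask the hugetlb caller uses; see the hugetlb fault path for the
 * real code):
 *
 *	struct mempolicy *mpol;
 *	nodemask_t *nodemask;
 *	struct zoneref *z;
 *	struct zone *zone;
 *	struct zonelist *zl;
 *
 *	zl = huge_zonelist(vma, addr, htlb_alloc_mask(h), &mpol, &nodemask);
 *	for_each_zone_zonelist_nodemask(zone, z, zl,
 *			gfp_zone(htlb_alloc_mask(h)), nodemask) {
 *		// try to dequeue a huge page from this zone's node
 *	}
 *	mpol_cond_put(mpol);
 */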
1879
1880/*
1881 * init_nodemask_of_mempolicy
1882 *
1883 * If the current task's mempolicy is "default" [NULL], return 'false'
1884 * to indicate default policy.  Otherwise, extract the policy nodemask
1885 * for 'bind' or 'interleave' policy into the argument nodemask, or
1886 * initialize the argument nodemask to contain the single node for
1887 * 'preferred' or 'local' policy and return 'true' to indicate presence
1888 * of non-default mempolicy.
1889 *
1890 * We don't bother with reference counting the mempolicy [mpol_get/put]
 * because the current task is examining its own mempolicy, and a task's
1892 * mempolicy is only ever changed by the task itself.
1893 *
1894 * N.B., it is the caller's responsibility to free a returned nodemask.
1895 */
1896bool init_nodemask_of_mempolicy(nodemask_t *mask)
1897{
1898	struct mempolicy *mempolicy;
1899	int nid;
1900
1901	if (!(mask && current->mempolicy))
1902		return false;
1903
1904	task_lock(current);
1905	mempolicy = current->mempolicy;
1906	switch (mempolicy->mode) {
1907	case MPOL_PREFERRED:
1908		if (mempolicy->flags & MPOL_F_LOCAL)
1909			nid = numa_node_id();
1910		else
1911			nid = mempolicy->v.preferred_node;
1912		init_nodemask_of_node(mask, nid);
1913		break;
1914
1915	case MPOL_BIND:
1916		/* Fall through */
1917	case MPOL_INTERLEAVE:
1918		*mask =  mempolicy->v.nodes;
1919		break;
1920
1921	default:
1922		BUG();
1923	}
1924	task_unlock(current);
1925
1926	return true;
1927}
1928#endif
1929
1930/*
1931 * mempolicy_nodemask_intersects
1932 *
1933 * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
1934 * policy.  Otherwise, check for intersection between mask and the policy
 * nodemask for 'bind' or 'interleave' policy.  For 'preferred' or 'local'
1936 * policy, always return true since it may allocate elsewhere on fallback.
1937 *
1938 * Takes task_lock(tsk) to prevent freeing of its mempolicy.
1939 */
1940bool mempolicy_nodemask_intersects(struct task_struct *tsk,
1941					const nodemask_t *mask)
1942{
1943	struct mempolicy *mempolicy;
1944	bool ret = true;
1945
1946	if (!mask)
1947		return ret;
1948	task_lock(tsk);
1949	mempolicy = tsk->mempolicy;
1950	if (!mempolicy)
1951		goto out;
1952
1953	switch (mempolicy->mode) {
1954	case MPOL_PREFERRED:
1955		/*
		 * MPOL_PREFERRED and MPOL_F_LOCAL only express a preference for
		 * which nodes to allocate from; allocations may fall back to
		 * other nodes when OOM.
1958		 * Thus, it's possible for tsk to have allocated memory from
1959		 * nodes in mask.
1960		 */
1961		break;
1962	case MPOL_BIND:
1963	case MPOL_INTERLEAVE:
1964		ret = nodes_intersects(mempolicy->v.nodes, *mask);
1965		break;
1966	default:
1967		BUG();
1968	}
1969out:
1970	task_unlock(tsk);
1971	return ret;
1972}
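/*
 * Usage note: this helper is primarily intended for the OOM killer's task
 * eligibility check, which wants to know whether killing @tsk could plausibly
 * free memory on the nodes in @mask.
 */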
1973
/*
 * Allocate a page in interleaved policy.
 * Own path because it needs to do special accounting.
 */
1976static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1977					unsigned nid)
1978{
1979	struct zonelist *zl;
1980	struct page *page;
1981
1982	zl = node_zonelist(nid, gfp);
1983	page = __alloc_pages(gfp, order, zl);
1984	if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1985		inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1986	return page;
1987}
1988
1989/**
1990 * 	alloc_pages_vma	- Allocate a page for a VMA.
1991 *
1992 * 	@gfp:
1993 *      %GFP_USER    user allocation.
1994 *      %GFP_KERNEL  kernel allocations,
1995 *      %GFP_HIGHMEM highmem/user allocations,
1996 *      %GFP_FS      allocation should not call back into a file system.
1997 *      %GFP_ATOMIC  don't sleep.
1998 *
 *	@order: Order of the GFP allocation.
 *	@vma:   Pointer to VMA or NULL if not available.
 *	@addr:  Virtual address of the allocation. Must be inside the VMA.
 *	@node:  Which node to prefer for allocation (modulo policy).
 *
 *	This function allocates a page from the kernel page pool and applies
 *	a NUMA policy associated with the VMA or the current process.
 *	When @vma is not NULL the caller must hold the mmap_sem of the VMA's
 *	mm_struct for reading, to prevent the VMA from going away. Should be
 *	used for all allocations of pages that will be mapped into user space.
 *	Returns NULL when no page can be allocated.
2011 */
2012struct page *
2013alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
2014		unsigned long addr, int node)
2015{
2016	struct mempolicy *pol;
2017	struct page *page;
2018	unsigned int cpuset_mems_cookie;
2019
2020retry_cpuset:
2021	pol = get_vma_policy(vma, addr);
2022	cpuset_mems_cookie = read_mems_allowed_begin();
2023
2024	if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
2025		unsigned nid;
2026
2027		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
2028		mpol_cond_put(pol);
2029		page = alloc_page_interleave(gfp, order, nid);
2030		if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2031			goto retry_cpuset;
2032
2033		return page;
2034	}
2035	page = __alloc_pages_nodemask(gfp, order,
2036				      policy_zonelist(gfp, pol, node),
2037				      policy_nodemask(gfp, pol));
2038	mpol_cond_put(pol);
2039	if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2040		goto retry_cpuset;
2041	return page;
2042}
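/*
 * Most callers are expected to go through the alloc_page_vma() wrapper in
 * <linux/gfp.h>, which fixes @order to 0 and @node to numa_node_id().
 * Illustrative use, in the style of an anonymous fault path:
 *
 *	page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
 */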
2043
2044/**
2045 * 	alloc_pages_current - Allocate pages.
2046 *
2047 *	@gfp:
2048 *		%GFP_USER   user allocation,
2049 *      	%GFP_KERNEL kernel allocation,
2050 *      	%GFP_HIGHMEM highmem allocation,
2051 *      	%GFP_FS     don't call back into a file system.
2052 *      	%GFP_ATOMIC don't sleep.
2053 *	@order: Power of two of allocation size in pages. 0 is a single page.
2054 *
 *	Allocate a page from the kernel page pool.  When not in
 *	interrupt context, apply the current process NUMA policy.
2057 *	Returns NULL when no page can be allocated.
2058 *
2059 *	Don't call cpuset_update_task_memory_state() unless
2060 *	1) it's ok to take cpuset_sem (can WAIT), and
2061 *	2) allocating for current task (not interrupt).
2062 */
2063struct page *alloc_pages_current(gfp_t gfp, unsigned order)
2064{
2065	struct mempolicy *pol = &default_policy;
2066	struct page *page;
2067	unsigned int cpuset_mems_cookie;
2068
2069	if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2070		pol = get_task_policy(current);
2071
2072retry_cpuset:
2073	cpuset_mems_cookie = read_mems_allowed_begin();
2074
2075	/*
2076	 * No reference counting needed for current->mempolicy
2077	 * nor system default_policy
2078	 */
2079	if (pol->mode == MPOL_INTERLEAVE)
2080		page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2081	else
2082		page = __alloc_pages_nodemask(gfp, order,
2083				policy_zonelist(gfp, pol, numa_node_id()),
2084				policy_nodemask(gfp, pol));
2085
2086	if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2087		goto retry_cpuset;
2088
2089	return page;
2090}
2091EXPORT_SYMBOL(alloc_pages_current);
2092
2093int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2094{
2095	struct mempolicy *pol = mpol_dup(vma_policy(src));
2096
2097	if (IS_ERR(pol))
2098		return PTR_ERR(pol);
2099	dst->vm_policy = pol;
2100	return 0;
2101}
2102
2103/*
2104 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
2106 * with the mems_allowed returned by cpuset_mems_allowed().  This
2107 * keeps mempolicies cpuset relative after its cpuset moves.  See
2108 * further kernel/cpuset.c update_nodemask().
2109 *
 * current's mempolicy may be rebound by another task (the task that changes
 * the cpuset's mems), so we needn't do rebind work for the current task.
2112 */
2113
2114/* Slow path of a mempolicy duplicate */
2115struct mempolicy *__mpol_dup(struct mempolicy *old)
2116{
2117	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2118
2119	if (!new)
2120		return ERR_PTR(-ENOMEM);
2121
2122	/* task's mempolicy is protected by alloc_lock */
2123	if (old == current->mempolicy) {
2124		task_lock(current);
2125		*new = *old;
2126		task_unlock(current);
2127	} else
2128		*new = *old;
2129
2130	if (current_cpuset_is_being_rebound()) {
2131		nodemask_t mems = cpuset_mems_allowed(current);
2132		if (new->flags & MPOL_F_REBINDING)
2133			mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
2134		else
2135			mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
2136	}
2137	atomic_set(&new->refcnt, 1);
2138	return new;
2139}
2140
2141/* Slow path of a mempolicy comparison */
2142bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2143{
2144	if (!a || !b)
2145		return false;
2146	if (a->mode != b->mode)
2147		return false;
2148	if (a->flags != b->flags)
2149		return false;
2150	if (mpol_store_user_nodemask(a))
2151		if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2152			return false;
2153
2154	switch (a->mode) {
2155	case MPOL_BIND:
2156		/* Fall through */
2157	case MPOL_INTERLEAVE:
2158		return !!nodes_equal(a->v.nodes, b->v.nodes);
2159	case MPOL_PREFERRED:
2160		return a->v.preferred_node == b->v.preferred_node;
2161	default:
2162		BUG();
2163		return false;
2164	}
2165}
2166
2167/*
2168 * Shared memory backing store policy support.
2169 *
2170 * Remember policies even when nobody has shared memory mapped.
2171 * The policies are kept in Red-Black tree linked from the inode.
2172 * They are protected by the sp->lock spinlock, which should be held
2173 * for any accesses to the tree.
2174 */
2175
2176/* lookup first element intersecting start-end */
2177/* Caller holds sp->lock */
2178static struct sp_node *
2179sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2180{
2181	struct rb_node *n = sp->root.rb_node;
2182
2183	while (n) {
2184		struct sp_node *p = rb_entry(n, struct sp_node, nd);
2185
2186		if (start >= p->end)
2187			n = n->rb_right;
2188		else if (end <= p->start)
2189			n = n->rb_left;
2190		else
2191			break;
2192	}
2193	if (!n)
2194		return NULL;
2195	for (;;) {
2196		struct sp_node *w = NULL;
2197		struct rb_node *prev = rb_prev(n);
2198		if (!prev)
2199			break;
2200		w = rb_entry(prev, struct sp_node, nd);
2201		if (w->end <= start)
2202			break;
2203		n = prev;
2204	}
2205	return rb_entry(n, struct sp_node, nd);
2206}
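/*
 * Worked example (illustrative): with stored ranges [1,3), [3,6) and [6,10),
 * sp_lookup(sp, 4, 9) may stop its descent on either [3,6) or [6,10)
 * depending on the tree shape; the rb_prev() walk then backs up while the
 * previous range still ends after 'start', so the first intersecting node,
 * [3,6), is returned.
 */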
2207
2208/* Insert a new shared policy into the list. */
2209/* Caller holds sp->lock */
2210static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2211{
2212	struct rb_node **p = &sp->root.rb_node;
2213	struct rb_node *parent = NULL;
2214	struct sp_node *nd;
2215
2216	while (*p) {
2217		parent = *p;
2218		nd = rb_entry(parent, struct sp_node, nd);
2219		if (new->start < nd->start)
2220			p = &(*p)->rb_left;
2221		else if (new->end > nd->end)
2222			p = &(*p)->rb_right;
2223		else
2224			BUG();
2225	}
2226	rb_link_node(&new->nd, parent, p);
2227	rb_insert_color(&new->nd, &sp->root);
2228	pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2229		 new->policy ? new->policy->mode : 0);
2230}
2231
2232/* Find shared policy intersecting idx */
2233struct mempolicy *
2234mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2235{
2236	struct mempolicy *pol = NULL;
2237	struct sp_node *sn;
2238
2239	if (!sp->root.rb_node)
2240		return NULL;
2241	spin_lock(&sp->lock);
2242	sn = sp_lookup(sp, idx, idx+1);
2243	if (sn) {
2244		mpol_get(sn->policy);
2245		pol = sn->policy;
2246	}
2247	spin_unlock(&sp->lock);
2248	return pol;
2249}
2250
2251static void sp_free(struct sp_node *n)
2252{
2253	mpol_put(n->policy);
2254	kmem_cache_free(sn_cache, n);
2255}
2256
2257/**
2258 * mpol_misplaced - check whether current page node is valid in policy
2259 *
2260 * @page: page to be checked
2261 * @vma: vm area where page mapped
2262 * @addr: virtual address where page mapped
2263 *
2264 * Lookup current policy node id for vma,addr and "compare to" page's
2265 * node id.
2266 *
2267 * Returns:
2268 *	-1	- not misplaced, page is in the right node
2269 *	node	- node id where the page should be
2270 *
2271 * Policy determination "mimics" alloc_page_vma().
2272 * Called from fault path where we know the vma and faulting address.
2273 */
2274int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2275{
2276	struct mempolicy *pol;
2277	struct zone *zone;
2278	int curnid = page_to_nid(page);
2279	unsigned long pgoff;
2280	int thiscpu = raw_smp_processor_id();
2281	int thisnid = cpu_to_node(thiscpu);
2282	int polnid = -1;
2283	int ret = -1;
2284
2285	BUG_ON(!vma);
2286
2287	pol = get_vma_policy(vma, addr);
2288	if (!(pol->flags & MPOL_F_MOF))
2289		goto out;
2290
2291	switch (pol->mode) {
2292	case MPOL_INTERLEAVE:
2293		BUG_ON(addr >= vma->vm_end);
2294		BUG_ON(addr < vma->vm_start);
2295
2296		pgoff = vma->vm_pgoff;
2297		pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2298		polnid = offset_il_node(pol, vma, pgoff);
2299		break;
2300
2301	case MPOL_PREFERRED:
2302		if (pol->flags & MPOL_F_LOCAL)
2303			polnid = numa_node_id();
2304		else
2305			polnid = pol->v.preferred_node;
2306		break;
2307
2308	case MPOL_BIND:
2309		/*
		 * MPOL_BIND allows binding to multiple nodes.
		 * Use the current page if it is in the policy nodemask,
		 * else select the nearest allowed node, if any.
		 * If there are no allowed nodes, use the current node [!misplaced].
2314		 */
2315		if (node_isset(curnid, pol->v.nodes))
2316			goto out;
2317		(void)first_zones_zonelist(
2318				node_zonelist(numa_node_id(), GFP_HIGHUSER),
2319				gfp_zone(GFP_HIGHUSER),
2320				&pol->v.nodes, &zone);
2321		polnid = zone->node;
2322		break;
2323
2324	default:
2325		BUG();
2326	}
2327
2328	/* Migrate the page towards the node whose CPU is referencing it */
2329	if (pol->flags & MPOL_F_MORON) {
2330		polnid = thisnid;
2331
2332		if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
2333			goto out;
2334	}
2335
2336	if (curnid != polnid)
2337		ret = polnid;
2338out:
2339	mpol_cond_put(pol);
2340
2341	return ret;
2342}
2343
2344static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2345{
	pr_debug("deleting %lx-%lx\n", n->start, n->end);
2347	rb_erase(&n->nd, &sp->root);
2348	sp_free(n);
2349}
2350
2351static void sp_node_init(struct sp_node *node, unsigned long start,
2352			unsigned long end, struct mempolicy *pol)
2353{
2354	node->start = start;
2355	node->end = end;
2356	node->policy = pol;
2357}
2358
2359static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2360				struct mempolicy *pol)
2361{
2362	struct sp_node *n;
2363	struct mempolicy *newpol;
2364
2365	n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2366	if (!n)
2367		return NULL;
2368
2369	newpol = mpol_dup(pol);
2370	if (IS_ERR(newpol)) {
2371		kmem_cache_free(sn_cache, n);
2372		return NULL;
2373	}
2374	newpol->flags |= MPOL_F_SHARED;
2375	sp_node_init(n, start, end, newpol);
2376
2377	return n;
2378}
2379
2380/* Replace a policy range. */
2381static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2382				 unsigned long end, struct sp_node *new)
2383{
2384	struct sp_node *n;
2385	struct sp_node *n_new = NULL;
2386	struct mempolicy *mpol_new = NULL;
2387	int ret = 0;
2388
2389restart:
2390	spin_lock(&sp->lock);
2391	n = sp_lookup(sp, start, end);
2392	/* Take care of old policies in the same range. */
2393	while (n && n->start < end) {
2394		struct rb_node *next = rb_next(&n->nd);
2395		if (n->start >= start) {
2396			if (n->end <= end)
2397				sp_delete(sp, n);
2398			else
2399				n->start = end;
2400		} else {
2401			/* Old policy spanning whole new range. */
2402			if (n->end > end) {
2403				if (!n_new)
2404					goto alloc_new;
2405
2406				*mpol_new = *n->policy;
2407				atomic_set(&mpol_new->refcnt, 1);
2408				sp_node_init(n_new, end, n->end, mpol_new);
2409				n->end = start;
2410				sp_insert(sp, n_new);
2411				n_new = NULL;
2412				mpol_new = NULL;
2413				break;
2414			} else
2415				n->end = start;
2416		}
2417		if (!next)
2418			break;
2419		n = rb_entry(next, struct sp_node, nd);
2420	}
2421	if (new)
2422		sp_insert(sp, new);
2423	spin_unlock(&sp->lock);
2424	ret = 0;
2425
2426err_out:
2427	if (mpol_new)
2428		mpol_put(mpol_new);
2429	if (n_new)
2430		kmem_cache_free(sn_cache, n_new);
2431
2432	return ret;
2433
2434alloc_new:
2435	spin_unlock(&sp->lock);
2436	ret = -ENOMEM;
2437	n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2438	if (!n_new)
2439		goto err_out;
2440	mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2441	if (!mpol_new)
2442		goto err_out;
2443	goto restart;
2444}
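/*
 * Worked example (illustrative): if the tree holds a single node [0,16) with
 * policy A and shared_policy_replace() is called for [4,8) with policy B, the
 * old node is split: it is trimmed to [0,4), a duplicate of A is inserted for
 * [8,16), and the new node [4,8) carrying B is inserted between them.
 */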
2445
2446/**
2447 * mpol_shared_policy_init - initialize shared policy for inode
2448 * @sp: pointer to inode shared policy
2449 * @mpol:  struct mempolicy to install
2450 *
2451 * Install non-NULL @mpol in inode's shared policy rb-tree.
2452 * On entry, the current task has a reference on a non-NULL @mpol.
2453 * This must be released on exit.
 * This is called from get_inode() context, so GFP_KERNEL allocations are safe.
2455 */
2456void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2457{
2458	int ret;
2459
2460	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
2461	spin_lock_init(&sp->lock);
2462
2463	if (mpol) {
2464		struct vm_area_struct pvma;
2465		struct mempolicy *new;
2466		NODEMASK_SCRATCH(scratch);
2467
2468		if (!scratch)
2469			goto put_mpol;
2470		/* contextualize the tmpfs mount point mempolicy */
2471		new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2472		if (IS_ERR(new))
2473			goto free_scratch; /* no valid nodemask intersection */
2474
2475		task_lock(current);
2476		ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2477		task_unlock(current);
2478		if (ret)
2479			goto put_new;
2480
2481		/* Create pseudo-vma that contains just the policy */
2482		memset(&pvma, 0, sizeof(struct vm_area_struct));
2483		pvma.vm_end = TASK_SIZE;	/* policy covers entire file */
2484		mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2485
2486put_new:
2487		mpol_put(new);			/* drop initial ref */
2488free_scratch:
2489		NODEMASK_SCRATCH_FREE(scratch);
2490put_mpol:
2491		mpol_put(mpol);	/* drop our incoming ref on sb mpol */
2492	}
2493}
2494
2495int mpol_set_shared_policy(struct shared_policy *info,
2496			struct vm_area_struct *vma, struct mempolicy *npol)
2497{
2498	int err;
2499	struct sp_node *new = NULL;
2500	unsigned long sz = vma_pages(vma);
2501
2502	pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2503		 vma->vm_pgoff,
2504		 sz, npol ? npol->mode : -1,
2505		 npol ? npol->flags : -1,
2506		 npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
2507
2508	if (npol) {
2509		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2510		if (!new)
2511			return -ENOMEM;
2512	}
2513	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2514	if (err && new)
2515		sp_free(new);
2516	return err;
2517}
2518
2519/* Free a backing policy store on inode delete. */
2520void mpol_free_shared_policy(struct shared_policy *p)
2521{
2522	struct sp_node *n;
2523	struct rb_node *next;
2524
2525	if (!p->root.rb_node)
2526		return;
2527	spin_lock(&p->lock);
2528	next = rb_first(&p->root);
2529	while (next) {
2530		n = rb_entry(next, struct sp_node, nd);
2531		next = rb_next(&n->nd);
2532		sp_delete(p, n);
2533	}
2534	spin_unlock(&p->lock);
2535}
2536
2537#ifdef CONFIG_NUMA_BALANCING
2538static int __initdata numabalancing_override;
2539
2540static void __init check_numabalancing_enable(void)
2541{
2542	bool numabalancing_default = false;
2543
2544	if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2545		numabalancing_default = true;
2546
2547	/* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2548	if (numabalancing_override)
2549		set_numabalancing_state(numabalancing_override == 1);
2550
2551	if (nr_node_ids > 1 && !numabalancing_override) {
		pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
			numabalancing_default ? "Enabling" : "Disabling");
2556		set_numabalancing_state(numabalancing_default);
2557	}
2558}
2559
2560static int __init setup_numabalancing(char *str)
2561{
2562	int ret = 0;
2563	if (!str)
2564		goto out;
2565
2566	if (!strcmp(str, "enable")) {
2567		numabalancing_override = 1;
2568		ret = 1;
2569	} else if (!strcmp(str, "disable")) {
2570		numabalancing_override = -1;
2571		ret = 1;
2572	}
2573out:
2574	if (!ret)
2575		pr_warn("Unable to parse numa_balancing=\n");
2576
2577	return ret;
2578}
2579__setup("numa_balancing=", setup_numabalancing);
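/*
 * Example: booting with "numa_balancing=disable" (or "=enable") on the kernel
 * command line overrides CONFIG_NUMA_BALANCING_DEFAULT_ENABLED; at runtime
 * the same switch is exposed via the kernel.numa_balancing sysctl.
 */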
2580#else
2581static inline void __init check_numabalancing_enable(void)
2582{
2583}
2584#endif /* CONFIG_NUMA_BALANCING */
2585
2586/* assumes fs == KERNEL_DS */
2587void __init numa_policy_init(void)
2588{
2589	nodemask_t interleave_nodes;
2590	unsigned long largest = 0;
2591	int nid, prefer = 0;
2592
2593	policy_cache = kmem_cache_create("numa_policy",
2594					 sizeof(struct mempolicy),
2595					 0, SLAB_PANIC, NULL);
2596
2597	sn_cache = kmem_cache_create("shared_policy_node",
2598				     sizeof(struct sp_node),
2599				     0, SLAB_PANIC, NULL);
2600
2601	for_each_node(nid) {
2602		preferred_node_policy[nid] = (struct mempolicy) {
2603			.refcnt = ATOMIC_INIT(1),
2604			.mode = MPOL_PREFERRED,
2605			.flags = MPOL_F_MOF | MPOL_F_MORON,
2606			.v = { .preferred_node = nid, },
2607		};
2608	}
2609
2610	/*
2611	 * Set interleaving policy for system init. Interleaving is only
2612	 * enabled across suitably sized nodes (default is >= 16MB), or
2613	 * fall back to the largest node if they're all smaller.
2614	 */
2615	nodes_clear(interleave_nodes);
2616	for_each_node_state(nid, N_MEMORY) {
2617		unsigned long total_pages = node_present_pages(nid);
2618
2619		/* Preserve the largest node */
2620		if (largest < total_pages) {
2621			largest = total_pages;
2622			prefer = nid;
2623		}
2624
2625		/* Interleave this node? */
2626		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2627			node_set(nid, interleave_nodes);
2628	}
2629
2630	/* All too small, use the largest */
2631	if (unlikely(nodes_empty(interleave_nodes)))
2632		node_set(prefer, interleave_nodes);
2633
2634	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2635		pr_err("%s: interleaving failed\n", __func__);
2636
2637	check_numabalancing_enable();
2638}
2639
2640/* Reset policy of current process to default */
2641void numa_default_policy(void)
2642{
2643	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2644}
2645
2646/*
2647 * Parse and format mempolicy from/to strings
2648 */
2649
2650/*
2651 * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
2652 */
2653static const char * const policy_modes[] =
2654{
2655	[MPOL_DEFAULT]    = "default",
2656	[MPOL_PREFERRED]  = "prefer",
2657	[MPOL_BIND]       = "bind",
2658	[MPOL_INTERLEAVE] = "interleave",
2659	[MPOL_LOCAL]      = "local",
2660};
2661
2662
2663#ifdef CONFIG_TMPFS
2664/**
2665 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
2666 * @str:  string containing mempolicy to parse
2667 * @mpol:  pointer to struct mempolicy pointer, returned on success.
2668 *
2669 * Format of input:
2670 *	<mode>[=<flags>][:<nodelist>]
2671 *
2672 * On success, returns 0, else 1
2673 */
2674int mpol_parse_str(char *str, struct mempolicy **mpol)
2675{
2676	struct mempolicy *new = NULL;
2677	unsigned short mode;
2678	unsigned short mode_flags;
2679	nodemask_t nodes;
2680	char *nodelist = strchr(str, ':');
2681	char *flags = strchr(str, '=');
2682	int err = 1;
2683
2684	if (nodelist) {
2685		/* NUL-terminate mode or flags string */
2686		*nodelist++ = '\0';
2687		if (nodelist_parse(nodelist, nodes))
2688			goto out;
2689		if (!nodes_subset(nodes, node_states[N_MEMORY]))
2690			goto out;
2691	} else
2692		nodes_clear(nodes);
2693
2694	if (flags)
2695		*flags++ = '\0';	/* terminate mode string */
2696
2697	for (mode = 0; mode < MPOL_MAX; mode++) {
2698		if (!strcmp(str, policy_modes[mode])) {
2699			break;
2700		}
2701	}
2702	if (mode >= MPOL_MAX)
2703		goto out;
2704
2705	switch (mode) {
2706	case MPOL_PREFERRED:
2707		/*
2708		 * Insist on a nodelist of one node only
2709		 */
2710		if (nodelist) {
2711			char *rest = nodelist;
2712			while (isdigit(*rest))
2713				rest++;
2714			if (*rest)
2715				goto out;
2716		}
2717		break;
2718	case MPOL_INTERLEAVE:
2719		/*
2720		 * Default to online nodes with memory if no nodelist
2721		 */
2722		if (!nodelist)
2723			nodes = node_states[N_MEMORY];
2724		break;
2725	case MPOL_LOCAL:
2726		/*
2727		 * Don't allow a nodelist;  mpol_new() checks flags
2728		 */
2729		if (nodelist)
2730			goto out;
2731		mode = MPOL_PREFERRED;
2732		break;
2733	case MPOL_DEFAULT:
2734		/*
		 * Insist on an empty nodelist
2736		 */
2737		if (!nodelist)
2738			err = 0;
2739		goto out;
2740	case MPOL_BIND:
2741		/*
2742		 * Insist on a nodelist
2743		 */
2744		if (!nodelist)
2745			goto out;
2746	}
2747
2748	mode_flags = 0;
2749	if (flags) {
2750		/*
2751		 * Currently, we only support two mutually exclusive
2752		 * mode flags.
2753		 */
2754		if (!strcmp(flags, "static"))
2755			mode_flags |= MPOL_F_STATIC_NODES;
2756		else if (!strcmp(flags, "relative"))
2757			mode_flags |= MPOL_F_RELATIVE_NODES;
2758		else
2759			goto out;
2760	}
2761
2762	new = mpol_new(mode, mode_flags, &nodes);
2763	if (IS_ERR(new))
2764		goto out;
2765
2766	/*
2767	 * Save nodes for mpol_to_str() to show the tmpfs mount options
2768	 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2769	 */
2770	if (mode != MPOL_PREFERRED)
2771		new->v.nodes = nodes;
2772	else if (nodelist)
2773		new->v.preferred_node = first_node(nodes);
2774	else
2775		new->flags |= MPOL_F_LOCAL;
2776
2777	/*
2778	 * Save nodes for contextualization: this will be used to "clone"
2779	 * the mempolicy in a specific context [cpuset] at a later time.
2780	 */
2781	new->w.user_nodemask = nodes;
2782
2783	err = 0;
2784
2785out:
2786	/* Restore string for error message */
2787	if (nodelist)
2788		*--nodelist = ':';
2789	if (flags)
2790		*--flags = '=';
2791	if (!err)
2792		*mpol = new;
2793	return err;
2794}
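/*
 * Illustrative strings accepted by mpol_parse_str(), e.g. via the tmpfs
 * "mpol=" mount option (node numbers are assumed to be online memory nodes):
 *
 *	mpol=interleave:0-3	interleave across nodes 0,1,2,3
 *	mpol=prefer=static:2	prefer node 2; nodemask not remapped on
 *				cpuset mems changes
 *	mpol=bind:0,2		allocate only from nodes 0 and 2
 *	mpol=local		allocate from the node of the allocating CPU
 */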
2795#endif /* CONFIG_TMPFS */
2796
2797/**
2798 * mpol_to_str - format a mempolicy structure for printing
2799 * @buffer:  to contain formatted mempolicy string
2800 * @maxlen:  length of @buffer
2801 * @pol:  pointer to mempolicy to be formatted
2802 *
2803 * Convert @pol into a string.  If @buffer is too short, truncate the string.
2804 * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
2805 * longest flag, "relative", and to display at least a few node ids.
2806 */
2807void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
2808{
2809	char *p = buffer;
2810	nodemask_t nodes = NODE_MASK_NONE;
2811	unsigned short mode = MPOL_DEFAULT;
2812	unsigned short flags = 0;
2813
2814	if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
2815		mode = pol->mode;
2816		flags = pol->flags;
2817	}
2818
2819	switch (mode) {
2820	case MPOL_DEFAULT:
2821		break;
2822	case MPOL_PREFERRED:
2823		if (flags & MPOL_F_LOCAL)
2824			mode = MPOL_LOCAL;
2825		else
2826			node_set(pol->v.preferred_node, nodes);
2827		break;
2828	case MPOL_BIND:
2829	case MPOL_INTERLEAVE:
2830		nodes = pol->v.nodes;
2831		break;
2832	default:
2833		WARN_ON_ONCE(1);
2834		snprintf(p, maxlen, "unknown");
2835		return;
2836	}
2837
2838	p += snprintf(p, maxlen, "%s", policy_modes[mode]);
2839
2840	if (flags & MPOL_MODE_FLAGS) {
2841		p += snprintf(p, buffer + maxlen - p, "=");
2842
2843		/*
2844		 * Currently, the only defined flags are mutually exclusive
2845		 */
2846		if (flags & MPOL_F_STATIC_NODES)
2847			p += snprintf(p, buffer + maxlen - p, "static");
2848		else if (flags & MPOL_F_RELATIVE_NODES)
2849			p += snprintf(p, buffer + maxlen - p, "relative");
2850	}
2851
2852	if (!nodes_empty(nodes)) {
2853		p += snprintf(p, buffer + maxlen - p, ":");
		p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
2855	}
2856}
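/*
 * Example output (illustrative): an interleave policy with
 * MPOL_F_STATIC_NODES over nodes 0-3 is rendered as "interleave=static:0-3",
 * a local policy as "local", and the default policy as "default".
 */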
2857