mempolicy.c revision 76aac0e9a17742e60d408be1a706e9aaad370891
1/*
2 * Simple NUMA memory policy for the Linux kernel.
3 *
4 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
6 * Subject to the GNU Public License, version 2.
7 *
8 * NUMA policy allows the user to give hints in which node(s) memory should
9 * be allocated.
10 *
11 * Support four policies per VMA and per process:
12 *
13 * The VMA policy has priority over the process policy for a page fault.
14 *
15 * interleave     Allocate memory interleaved over a set of nodes,
16 *                with normal fallback if it fails.
17 *                For VMA based allocations this interleaves based on the
18 *                offset into the backing object or offset into the mapping
19 *                for anonymous memory. For the process policy a
20 *                per-process counter is used.
21 *
22 * bind           Only allocate memory on a specific set of nodes,
23 *                no fallback.
24 *                FIXME: memory is allocated starting with the first node
25 *                and proceeding to the last. It would be better if bind
26 *                truly restricted the allocation to the specified memory nodes.
27 *
28 * preferred       Try a specific node first before normal fallback.
29 *                As a special case node -1 here means do the allocation
30 *                on the local CPU. This is normally identical to default,
31 *                but useful to set in a VMA when you have a non-default
32 *                process policy.
33 *
34 * default        Allocate on the local node first, or when on a VMA
35 *                use the process policy. This is what Linux always did
36 *		  in a NUMA aware kernel and still does by, ahem, default.
37 *
38 * The process policy is applied for most non-interrupt memory allocations
39 * in that process' context. Interrupts ignore the policies and always
40 * try to allocate on the local CPU. The VMA policy is only applied for memory
41 * allocations for a VMA in the VM.
42 *
43 * Currently there are a few corner cases in swapping where the policy
44 * is not applied, but the majority should be handled. When the process
45 * policy is used it is not remembered across swap outs/swap ins.
46 *
47 * Only the highest zone in the zone hierarchy gets policied. Allocations
48 * requesting a lower zone just use the default policy. This implies that
49 * on systems with highmem, kernel lowmem allocations don't get policied.
50 * Same with GFP_DMA allocations.
51 *
52 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
53 * all users and remembered even when nobody has the memory mapped.
54 */
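
/*
 * Illustrative userspace sketch (not part of this file; the wrapper names
 * and header are an assumption, as provided by libnuma's <numaif.h>):
 * the policies above are normally requested through the set_mempolicy(2)
 * and mbind(2) system calls, roughly like this:
 *
 *	#include <numaif.h>
 *
 *	unsigned long mask = (1UL << 0) | (1UL << 1);	// nodes 0 and 1
 *
 *	// process policy: interleave allocations across nodes 0 and 1
 *	set_mempolicy(MPOL_INTERLEAVE, &mask, sizeof(mask) * 8);
 *
 *	// VMA policy: bind an existing mapping to node 0 only
 *	unsigned long node0 = 1UL << 0;
 *	mbind(addr, len, MPOL_BIND, &node0, sizeof(node0) * 8, 0);
 *
 * The kernel-side entry points are sys_set_mempolicy() and sys_mbind()
 * further down in this file.
 */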
55
56/* Notebook:
57   fix mmap readahead to honour policy and enable policy for any page cache
58   object
59   statistics for bigpages
60   global policy for page cache? currently it uses process policy. Requires
61   first item above.
62   handle mremap for shared memory (currently ignored for the policy)
63   grows down?
64   make bind policy root only? It can trigger oom much faster and the
65   kernel is not always graceful about that.
66*/
67
68#include <linux/mempolicy.h>
69#include <linux/mm.h>
70#include <linux/highmem.h>
71#include <linux/hugetlb.h>
72#include <linux/kernel.h>
73#include <linux/sched.h>
74#include <linux/nodemask.h>
75#include <linux/cpuset.h>
76#include <linux/gfp.h>
77#include <linux/slab.h>
78#include <linux/string.h>
79#include <linux/module.h>
80#include <linux/nsproxy.h>
81#include <linux/interrupt.h>
82#include <linux/init.h>
83#include <linux/compat.h>
84#include <linux/swap.h>
85#include <linux/seq_file.h>
86#include <linux/proc_fs.h>
87#include <linux/migrate.h>
88#include <linux/rmap.h>
89#include <linux/security.h>
90#include <linux/syscalls.h>
91#include <linux/ctype.h>
92
93#include <asm/tlbflush.h>
94#include <asm/uaccess.h>
95
96#include "internal.h"
97
98/* Internal flags */
99#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for contiguous vmas */
100#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
101#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)		/* Gather statistics */
102
103static struct kmem_cache *policy_cache;
104static struct kmem_cache *sn_cache;
105
106/* Highest zone. A specific allocation for a zone below that is not
107   policied. */
108enum zone_type policy_zone = 0;
109
110/*
111 * run-time system-wide default policy => local allocation
112 */
113struct mempolicy default_policy = {
114	.refcnt = ATOMIC_INIT(1), /* never free it */
115	.mode = MPOL_PREFERRED,
116	.flags = MPOL_F_LOCAL,
117};
118
119static const struct mempolicy_operations {
120	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
121	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
122} mpol_ops[MPOL_MAX];
123
124/* Check that the nodemask contains at least one populated zone */
125static int is_valid_nodemask(const nodemask_t *nodemask)
126{
127	int nd, k;
128
129	/* Check that there is something useful in this mask */
130	k = policy_zone;
131
132	for_each_node_mask(nd, *nodemask) {
133		struct zone *z;
134
135		for (k = 0; k <= policy_zone; k++) {
136			z = &NODE_DATA(nd)->node_zones[k];
137			if (z->present_pages > 0)
138				return 1;
139		}
140	}
141
142	return 0;
143}
144
145static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
146{
147	return pol->flags & (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES);
148}
149
150static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
151				   const nodemask_t *rel)
152{
153	nodemask_t tmp;
154	nodes_fold(tmp, *orig, nodes_weight(*rel));
155	nodes_onto(*ret, tmp, *rel);
156}
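
/*
 * Worked example (illustration only): with *rel = cpuset mems_allowed =
 * {4,5,6} and a user-supplied relative mask *orig = {0,2}, nodes_fold()
 * wraps *orig modulo nodes_weight(*rel) == 3 (a no-op here), and
 * nodes_onto() then maps bit 0 onto node 4 and bit 2 onto node 6, so
 * *ret becomes {4,6}.  This is the MPOL_F_RELATIVE_NODES remapping.
 */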
157
158static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
159{
160	if (nodes_empty(*nodes))
161		return -EINVAL;
162	pol->v.nodes = *nodes;
163	return 0;
164}
165
166static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
167{
168	if (!nodes)
169		pol->flags |= MPOL_F_LOCAL;	/* local allocation */
170	else if (nodes_empty(*nodes))
171		return -EINVAL;			/*  no allowed nodes */
172	else
173		pol->v.preferred_node = first_node(*nodes);
174	return 0;
175}
176
177static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
178{
179	if (!is_valid_nodemask(nodes))
180		return -EINVAL;
181	pol->v.nodes = *nodes;
182	return 0;
183}
184
185/* Create a new policy */
186static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
187				  nodemask_t *nodes)
188{
189	struct mempolicy *policy;
190	nodemask_t cpuset_context_nmask;
191	int ret;
192
193	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
194		 mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
195
196	if (mode == MPOL_DEFAULT) {
197		if (nodes && !nodes_empty(*nodes))
198			return ERR_PTR(-EINVAL);
199		return NULL;	/* simply delete any existing policy */
200	}
201	VM_BUG_ON(!nodes);
202
203	/*
204	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
205	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
206	 * All other modes require a valid pointer to a non-empty nodemask.
207	 */
208	if (mode == MPOL_PREFERRED) {
209		if (nodes_empty(*nodes)) {
210			if (((flags & MPOL_F_STATIC_NODES) ||
211			     (flags & MPOL_F_RELATIVE_NODES)))
212				return ERR_PTR(-EINVAL);
213			nodes = NULL;	/* flag local alloc */
214		}
215	} else if (nodes_empty(*nodes))
216		return ERR_PTR(-EINVAL);
217	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
218	if (!policy)
219		return ERR_PTR(-ENOMEM);
220	atomic_set(&policy->refcnt, 1);
221	policy->mode = mode;
222	policy->flags = flags;
223
224	if (nodes) {
225		/*
226		 * cpuset related setup doesn't apply to local allocation
227		 */
228		cpuset_update_task_memory_state();
229		if (flags & MPOL_F_RELATIVE_NODES)
230			mpol_relative_nodemask(&cpuset_context_nmask, nodes,
231					       &cpuset_current_mems_allowed);
232		else
233			nodes_and(cpuset_context_nmask, *nodes,
234				  cpuset_current_mems_allowed);
235		if (mpol_store_user_nodemask(policy))
236			policy->w.user_nodemask = *nodes;
237		else
238			policy->w.cpuset_mems_allowed =
239						cpuset_mems_allowed(current);
240	}
241
242	ret = mpol_ops[mode].create(policy,
243				nodes ? &cpuset_context_nmask : NULL);
244	if (ret < 0) {
245		kmem_cache_free(policy_cache, policy);
246		return ERR_PTR(ret);
247	}
248	return policy;
249}
250
251/* Slow path of a mpol destructor. */
252void __mpol_put(struct mempolicy *p)
253{
254	if (!atomic_dec_and_test(&p->refcnt))
255		return;
256	kmem_cache_free(policy_cache, p);
257}
258
259static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
260{
261}
262
263static void mpol_rebind_nodemask(struct mempolicy *pol,
264				 const nodemask_t *nodes)
265{
266	nodemask_t tmp;
267
268	if (pol->flags & MPOL_F_STATIC_NODES)
269		nodes_and(tmp, pol->w.user_nodemask, *nodes);
270	else if (pol->flags & MPOL_F_RELATIVE_NODES)
271		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
272	else {
273		nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed,
274			    *nodes);
275		pol->w.cpuset_mems_allowed = *nodes;
276	}
277
278	pol->v.nodes = tmp;
279	if (!node_isset(current->il_next, tmp)) {
280		current->il_next = next_node(current->il_next, tmp);
281		if (current->il_next >= MAX_NUMNODES)
282			current->il_next = first_node(tmp);
283		if (current->il_next >= MAX_NUMNODES)
284			current->il_next = numa_node_id();
285	}
286}
287
288static void mpol_rebind_preferred(struct mempolicy *pol,
289				  const nodemask_t *nodes)
290{
291	nodemask_t tmp;
292
293	if (pol->flags & MPOL_F_STATIC_NODES) {
294		int node = first_node(pol->w.user_nodemask);
295
296		if (node_isset(node, *nodes)) {
297			pol->v.preferred_node = node;
298			pol->flags &= ~MPOL_F_LOCAL;
299		} else
300			pol->flags |= MPOL_F_LOCAL;
301	} else if (pol->flags & MPOL_F_RELATIVE_NODES) {
302		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
303		pol->v.preferred_node = first_node(tmp);
304	} else if (!(pol->flags & MPOL_F_LOCAL)) {
305		pol->v.preferred_node = node_remap(pol->v.preferred_node,
306						   pol->w.cpuset_mems_allowed,
307						   *nodes);
308		pol->w.cpuset_mems_allowed = *nodes;
309	}
310}
311
312/* Migrate a policy to a different set of nodes */
313static void mpol_rebind_policy(struct mempolicy *pol,
314			       const nodemask_t *newmask)
315{
316	if (!pol)
317		return;
318	if (!mpol_store_user_nodemask(pol) &&
319	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
320		return;
321	mpol_ops[pol->mode].rebind(pol, newmask);
322}
323
324/*
325 * Wrapper for mpol_rebind_policy() that just requires task
326 * pointer, and updates task mempolicy.
327 */
328
329void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
330{
331	mpol_rebind_policy(tsk->mempolicy, new);
332}
333
334/*
335 * Rebind each vma in mm to new nodemask.
336 *
337 * Call holding a reference to mm.  Takes mm->mmap_sem during call.
338 */
339
340void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
341{
342	struct vm_area_struct *vma;
343
344	down_write(&mm->mmap_sem);
345	for (vma = mm->mmap; vma; vma = vma->vm_next)
346		mpol_rebind_policy(vma->vm_policy, new);
347	up_write(&mm->mmap_sem);
348}
349
350static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
351	[MPOL_DEFAULT] = {
352		.rebind = mpol_rebind_default,
353	},
354	[MPOL_INTERLEAVE] = {
355		.create = mpol_new_interleave,
356		.rebind = mpol_rebind_nodemask,
357	},
358	[MPOL_PREFERRED] = {
359		.create = mpol_new_preferred,
360		.rebind = mpol_rebind_preferred,
361	},
362	[MPOL_BIND] = {
363		.create = mpol_new_bind,
364		.rebind = mpol_rebind_nodemask,
365	},
366};
367
368static void gather_stats(struct page *, void *, int pte_dirty);
369static void migrate_page_add(struct page *page, struct list_head *pagelist,
370				unsigned long flags);
371
372/* Scan through pages checking if pages follow certain conditions. */
373static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
374		unsigned long addr, unsigned long end,
375		const nodemask_t *nodes, unsigned long flags,
376		void *private)
377{
378	pte_t *orig_pte;
379	pte_t *pte;
380	spinlock_t *ptl;
381
382	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
383	do {
384		struct page *page;
385		int nid;
386
387		if (!pte_present(*pte))
388			continue;
389		page = vm_normal_page(vma, addr, *pte);
390		if (!page)
391			continue;
392		/*
393		 * The check for PageReserved here is important to avoid
394		 * handling zero pages and other pages that may have been
395		 * marked special by the system.
396		 *
397		 * If PageReserved were not checked here then, e.g., the
398		 * location of the zero page could have an influence
399		 * on MPOL_MF_STRICT, zero pages would be counted for
400		 * the per-node stats, and there would be useless attempts
401		 * to put zero pages on the migration list.
402		 */
403		if (PageReserved(page))
404			continue;
405		nid = page_to_nid(page);
406		if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
407			continue;
408
409		if (flags & MPOL_MF_STATS)
410			gather_stats(page, private, pte_dirty(*pte));
411		else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
412			migrate_page_add(page, private, flags);
413		else
414			break;
415	} while (pte++, addr += PAGE_SIZE, addr != end);
416	pte_unmap_unlock(orig_pte, ptl);
417	return addr != end;
418}
419
420static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
421		unsigned long addr, unsigned long end,
422		const nodemask_t *nodes, unsigned long flags,
423		void *private)
424{
425	pmd_t *pmd;
426	unsigned long next;
427
428	pmd = pmd_offset(pud, addr);
429	do {
430		next = pmd_addr_end(addr, end);
431		if (pmd_none_or_clear_bad(pmd))
432			continue;
433		if (check_pte_range(vma, pmd, addr, next, nodes,
434				    flags, private))
435			return -EIO;
436	} while (pmd++, addr = next, addr != end);
437	return 0;
438}
439
440static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
441		unsigned long addr, unsigned long end,
442		const nodemask_t *nodes, unsigned long flags,
443		void *private)
444{
445	pud_t *pud;
446	unsigned long next;
447
448	pud = pud_offset(pgd, addr);
449	do {
450		next = pud_addr_end(addr, end);
451		if (pud_none_or_clear_bad(pud))
452			continue;
453		if (check_pmd_range(vma, pud, addr, next, nodes,
454				    flags, private))
455			return -EIO;
456	} while (pud++, addr = next, addr != end);
457	return 0;
458}
459
460static inline int check_pgd_range(struct vm_area_struct *vma,
461		unsigned long addr, unsigned long end,
462		const nodemask_t *nodes, unsigned long flags,
463		void *private)
464{
465	pgd_t *pgd;
466	unsigned long next;
467
468	pgd = pgd_offset(vma->vm_mm, addr);
469	do {
470		next = pgd_addr_end(addr, end);
471		if (pgd_none_or_clear_bad(pgd))
472			continue;
473		if (check_pud_range(vma, pgd, addr, next, nodes,
474				    flags, private))
475			return -EIO;
476	} while (pgd++, addr = next, addr != end);
477	return 0;
478}
479
480/*
481 * Check if all pages in a range are on a set of nodes.
482 * If pagelist != NULL then isolate pages from the LRU and
483 * put them on the pagelist.
484 */
485static struct vm_area_struct *
486check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
487		const nodemask_t *nodes, unsigned long flags, void *private)
488{
489	int err;
490	struct vm_area_struct *first, *vma, *prev;
491
492	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
493
494		err = migrate_prep();
495		if (err)
496			return ERR_PTR(err);
497	}
498
499	first = find_vma(mm, start);
500	if (!first)
501		return ERR_PTR(-EFAULT);
502	prev = NULL;
503	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
504		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
505			if (!vma->vm_next && vma->vm_end < end)
506				return ERR_PTR(-EFAULT);
507			if (prev && prev->vm_end < vma->vm_start)
508				return ERR_PTR(-EFAULT);
509		}
510		if (!is_vm_hugetlb_page(vma) &&
511		    ((flags & MPOL_MF_STRICT) ||
512		     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
513				vma_migratable(vma)))) {
514			unsigned long endvma = vma->vm_end;
515
516			if (endvma > end)
517				endvma = end;
518			if (vma->vm_start > start)
519				start = vma->vm_start;
520			err = check_pgd_range(vma, start, endvma, nodes,
521						flags, private);
522			if (err) {
523				first = ERR_PTR(err);
524				break;
525			}
526		}
527		prev = vma;
528	}
529	return first;
530}
531
532/* Apply policy to a single VMA */
533static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
534{
535	int err = 0;
536	struct mempolicy *old = vma->vm_policy;
537
538	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
539		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
540		 vma->vm_ops, vma->vm_file,
541		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
542
543	if (vma->vm_ops && vma->vm_ops->set_policy)
544		err = vma->vm_ops->set_policy(vma, new);
545	if (!err) {
546		mpol_get(new);
547		vma->vm_policy = new;
548		mpol_put(old);
549	}
550	return err;
551}
552
553/* Step 2: apply policy to a range and do splits. */
554static int mbind_range(struct vm_area_struct *vma, unsigned long start,
555		       unsigned long end, struct mempolicy *new)
556{
557	struct vm_area_struct *next;
558	int err;
559
560	err = 0;
561	for (; vma && vma->vm_start < end; vma = next) {
562		next = vma->vm_next;
563		if (vma->vm_start < start)
564			err = split_vma(vma->vm_mm, vma, start, 1);
565		if (!err && vma->vm_end > end)
566			err = split_vma(vma->vm_mm, vma, end, 0);
567		if (!err)
568			err = policy_vma(vma, new);
569		if (err)
570			break;
571	}
572	return err;
573}
574
575/*
576 * Update task->flags PF_MEMPOLICY bit: set iff non-default
577 * mempolicy.  Allows more rapid checking of this (combined perhaps
578 * with other PF_* flag bits) on memory allocation hot code paths.
579 *
580 * If called from outside this file, the task 'p' should -only- be
581 * a newly forked child not yet visible on the task list, because
582 * manipulating the task flags of a visible task is not safe.
583 *
584 * The above limitation is why this routine has the funny name
585 * mpol_fix_fork_child_flag().
586 *
587 * It is also safe to call this with a task pointer of current,
588 * which the static wrapper mpol_set_task_struct_flag() does,
589 * for use within this file.
590 */
591
592void mpol_fix_fork_child_flag(struct task_struct *p)
593{
594	if (p->mempolicy)
595		p->flags |= PF_MEMPOLICY;
596	else
597		p->flags &= ~PF_MEMPOLICY;
598}
599
600static void mpol_set_task_struct_flag(void)
601{
602	mpol_fix_fork_child_flag(current);
603}
604
605/* Set the process memory policy */
606static long do_set_mempolicy(unsigned short mode, unsigned short flags,
607			     nodemask_t *nodes)
608{
609	struct mempolicy *new;
610	struct mm_struct *mm = current->mm;
611
612	new = mpol_new(mode, flags, nodes);
613	if (IS_ERR(new))
614		return PTR_ERR(new);
615
616	/*
617	 * prevent changing our mempolicy while show_numa_maps()
618	 * is using it.
619	 * Note:  do_set_mempolicy() can be called at init time
620	 * with no 'mm'.
621	 */
622	if (mm)
623		down_write(&mm->mmap_sem);
624	mpol_put(current->mempolicy);
625	current->mempolicy = new;
626	mpol_set_task_struct_flag();
627	if (new && new->mode == MPOL_INTERLEAVE &&
628	    nodes_weight(new->v.nodes))
629		current->il_next = first_node(new->v.nodes);
630	if (mm)
631		up_write(&mm->mmap_sem);
632
633	return 0;
634}
635
636/*
637 * Return nodemask for policy for get_mempolicy() query
638 */
639static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
640{
641	nodes_clear(*nodes);
642	if (p == &default_policy)
643		return;
644
645	switch (p->mode) {
646	case MPOL_BIND:
647		/* Fall through */
648	case MPOL_INTERLEAVE:
649		*nodes = p->v.nodes;
650		break;
651	case MPOL_PREFERRED:
652		if (!(p->flags & MPOL_F_LOCAL))
653			node_set(p->v.preferred_node, *nodes);
654		/* else return empty node mask for local allocation */
655		break;
656	default:
657		BUG();
658	}
659}
660
661static int lookup_node(struct mm_struct *mm, unsigned long addr)
662{
663	struct page *p;
664	int err;
665
666	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
667	if (err >= 0) {
668		err = page_to_nid(p);
669		put_page(p);
670	}
671	return err;
672}
673
674/* Retrieve NUMA policy */
675static long do_get_mempolicy(int *policy, nodemask_t *nmask,
676			     unsigned long addr, unsigned long flags)
677{
678	int err;
679	struct mm_struct *mm = current->mm;
680	struct vm_area_struct *vma = NULL;
681	struct mempolicy *pol = current->mempolicy;
682
683	cpuset_update_task_memory_state();
684	if (flags &
685		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
686		return -EINVAL;
687
688	if (flags & MPOL_F_MEMS_ALLOWED) {
689		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
690			return -EINVAL;
691		*policy = 0;	/* just so it's initialized */
692		*nmask  = cpuset_current_mems_allowed;
693		return 0;
694	}
695
696	if (flags & MPOL_F_ADDR) {
697		/*
698		 * Do NOT fall back to task policy if the
699		 * vma/shared policy at addr is NULL.  We
700		 * want to return MPOL_DEFAULT in this case.
701		 */
702		down_read(&mm->mmap_sem);
703		vma = find_vma_intersection(mm, addr, addr+1);
704		if (!vma) {
705			up_read(&mm->mmap_sem);
706			return -EFAULT;
707		}
708		if (vma->vm_ops && vma->vm_ops->get_policy)
709			pol = vma->vm_ops->get_policy(vma, addr);
710		else
711			pol = vma->vm_policy;
712	} else if (addr)
713		return -EINVAL;
714
715	if (!pol)
716		pol = &default_policy;	/* indicates default behavior */
717
718	if (flags & MPOL_F_NODE) {
719		if (flags & MPOL_F_ADDR) {
720			err = lookup_node(mm, addr);
721			if (err < 0)
722				goto out;
723			*policy = err;
724		} else if (pol == current->mempolicy &&
725				pol->mode == MPOL_INTERLEAVE) {
726			*policy = current->il_next;
727		} else {
728			err = -EINVAL;
729			goto out;
730		}
731	} else {
732		*policy = pol == &default_policy ? MPOL_DEFAULT :
733						pol->mode;
734		/*
735		 * Internal mempolicy flags must be masked off before exposing
736		 * the policy to userspace.
737		 */
738		*policy |= (pol->flags & MPOL_MODE_FLAGS);
739	}
740
741	if (vma) {
742		up_read(&current->mm->mmap_sem);
743		vma = NULL;
744	}
745
746	err = 0;
747	if (nmask)
748		get_policy_nodemask(pol, nmask);
749
750 out:
751	mpol_cond_put(pol);
752	if (vma)
753		up_read(&current->mm->mmap_sem);
754	return err;
755}
756
757#ifdef CONFIG_MIGRATION
758/*
759 * page migration
760 */
761static void migrate_page_add(struct page *page, struct list_head *pagelist,
762				unsigned long flags)
763{
764	/*
765	 * Avoid migrating a page that is shared with others.
766	 */
767	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
768		if (!isolate_lru_page(page)) {
769			list_add_tail(&page->lru, pagelist);
770		}
771	}
772}
773
774static struct page *new_node_page(struct page *page, unsigned long node, int **x)
775{
776	return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0);
777}
778
779/*
780 * Migrate pages from one node to a target node.
781 * Returns error or the number of pages not migrated.
782 */
783static int migrate_to_node(struct mm_struct *mm, int source, int dest,
784			   int flags)
785{
786	nodemask_t nmask;
787	LIST_HEAD(pagelist);
788	int err = 0;
789
790	nodes_clear(nmask);
791	node_set(source, nmask);
792
793	check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
794			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
795
796	if (!list_empty(&pagelist))
797		err = migrate_pages(&pagelist, new_node_page, dest);
798
799	return err;
800}
801
802/*
803 * Move pages between the two nodesets so as to preserve the physical
804 * layout as much as possible.
805 *
806 * Returns the number of pages that could not be moved.
807 */
808int do_migrate_pages(struct mm_struct *mm,
809	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
810{
811	int busy = 0;
812	int err = 0;
813	nodemask_t tmp;
814
815	down_read(&mm->mmap_sem);
816
817	err = migrate_vmas(mm, from_nodes, to_nodes, flags);
818	if (err)
819		goto out;
820
821/*
822 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
823 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
824 * bit in 'tmp', and return that <source, dest> pair for migration.
825 * The pair of nodemasks 'to' and 'from' define the map.
826 *
827 * If no pair of bits is found that way, fallback to picking some
828 * pair of 'source' and 'dest' bits that are not the same.  If the
829 * 'source' and 'dest' bits are the same, this represents a node
830 * that will be migrating to itself, so no pages need move.
831 *
832 * If no bits are left in 'tmp', or if all remaining bits left
833 * in 'tmp' correspond to the same bit in 'to', return false
834 * (nothing left to migrate).
835 *
836 * This lets us pick a pair of nodes to migrate between, such that
837 * if possible the dest node is not already occupied by some other
838 * source node, minimizing the risk of overloading the memory on a
839 * node that would happen if we migrated incoming memory to a node
840 * before migrating outgoing memory off of that same node.
841 *
842 * A single scan of tmp is sufficient.  As we go, we remember the
843 * most recent <s, d> pair that moved (s != d).  If we find a pair
844 * that not only moved, but what's better, moved to an empty slot
845 * (d is not set in tmp), then we break out then, with that pair.
846 * Otherwise when we finish scanning tmp, we at least have the
847 * most recent <s, d> pair that moved.  If we get all the way through
848 * the scan of tmp without finding any node that moved, much less
849 * moved to an empty node, then there is nothing left worth migrating.
850 */
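
/*
 * Worked example (illustration only): from_nodes = {0,1}, to_nodes = {1,2},
 * so node_remap() gives 0 -> 1 and 1 -> 2.  The first scan of tmp = {0,1}
 * remembers <0,1> but keeps going because dest 1 is still a pending source;
 * it then finds <1,2>, whose dest is not in tmp, and breaks.  Node 1 is
 * therefore drained to node 2 first, and only then is node 0 migrated onto
 * the now-emptier node 1.
 */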
851
852	tmp = *from_nodes;
853	while (!nodes_empty(tmp)) {
854		int s,d;
855		int source = -1;
856		int dest = 0;
857
858		for_each_node_mask(s, tmp) {
859			d = node_remap(s, *from_nodes, *to_nodes);
860			if (s == d)
861				continue;
862
863			source = s;	/* Node moved. Memorize */
864			dest = d;
865
866			/* dest not in remaining from nodes? */
867			if (!node_isset(dest, tmp))
868				break;
869		}
870		if (source == -1)
871			break;
872
873		node_clear(source, tmp);
874		err = migrate_to_node(mm, source, dest, flags);
875		if (err > 0)
876			busy += err;
877		if (err < 0)
878			break;
879	}
880out:
881	up_read(&mm->mmap_sem);
882	if (err < 0)
883		return err;
884	return busy;
885
886}
887
888/*
889 * Allocate a new page for page migration based on vma policy.
890 * Start assuming that page is mapped by vma pointed to by @private.
891 * Search forward from there, if not.  N.B., this assumes that the
892 * list of pages handed to migrate_pages()--which is how we get here--
893 * is in virtual address order.
894 */
895static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
896{
897	struct vm_area_struct *vma = (struct vm_area_struct *)private;
898	unsigned long uninitialized_var(address);
899
900	while (vma) {
901		address = page_address_in_vma(page, vma);
902		if (address != -EFAULT)
903			break;
904		vma = vma->vm_next;
905	}
906
907	/*
908	 * if !vma, alloc_page_vma() will use task or system default policy
909	 */
910	return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
911}
912#else
913
914static void migrate_page_add(struct page *page, struct list_head *pagelist,
915				unsigned long flags)
916{
917}
918
919int do_migrate_pages(struct mm_struct *mm,
920	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
921{
922	return -ENOSYS;
923}
924
925static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
926{
927	return NULL;
928}
929#endif
930
931static long do_mbind(unsigned long start, unsigned long len,
932		     unsigned short mode, unsigned short mode_flags,
933		     nodemask_t *nmask, unsigned long flags)
934{
935	struct vm_area_struct *vma;
936	struct mm_struct *mm = current->mm;
937	struct mempolicy *new;
938	unsigned long end;
939	int err;
940	LIST_HEAD(pagelist);
941
942	if (flags & ~(unsigned long)(MPOL_MF_STRICT |
943				     MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
944		return -EINVAL;
945	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
946		return -EPERM;
947
948	if (start & ~PAGE_MASK)
949		return -EINVAL;
950
951	if (mode == MPOL_DEFAULT)
952		flags &= ~MPOL_MF_STRICT;
953
954	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
955	end = start + len;
956
957	if (end < start)
958		return -EINVAL;
959	if (end == start)
960		return 0;
961
962	new = mpol_new(mode, mode_flags, nmask);
963	if (IS_ERR(new))
964		return PTR_ERR(new);
965
966	/*
967	 * If we are using the default policy then operations
968	 * on discontiguous address ranges are okay after all
969	 */
970	if (!new)
971		flags |= MPOL_MF_DISCONTIG_OK;
972
973	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
974		 start, start + len, mode, mode_flags,
975		 nmask ? nodes_addr(*nmask)[0] : -1);
976
977	down_write(&mm->mmap_sem);
978	vma = check_range(mm, start, end, nmask,
979			  flags | MPOL_MF_INVERT, &pagelist);
980
981	err = PTR_ERR(vma);
982	if (!IS_ERR(vma)) {
983		int nr_failed = 0;
984
985		err = mbind_range(vma, start, end, new);
986
987		if (!list_empty(&pagelist))
988			nr_failed = migrate_pages(&pagelist, new_vma_page,
989						(unsigned long)vma);
990
991		if (!err && nr_failed && (flags & MPOL_MF_STRICT))
992			err = -EIO;
993	}
994
995	up_write(&mm->mmap_sem);
996	mpol_put(new);
997	return err;
998}
999
1000/*
1001 * User space interface with variable sized bitmaps for nodelists.
1002 */
1003
1004/* Copy a node mask from user space. */
1005static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1006		     unsigned long maxnode)
1007{
1008	unsigned long k;
1009	unsigned long nlongs;
1010	unsigned long endmask;
1011
1012	--maxnode;
1013	nodes_clear(*nodes);
1014	if (maxnode == 0 || !nmask)
1015		return 0;
1016	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1017		return -EINVAL;
1018
1019	nlongs = BITS_TO_LONGS(maxnode);
1020	if ((maxnode % BITS_PER_LONG) == 0)
1021		endmask = ~0UL;
1022	else
1023		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1024
1025	/* When the user specified more nodes than supported, just check
1026	   that the unsupported part is all zero. */
1027	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1028		if (nlongs > PAGE_SIZE/sizeof(long))
1029			return -EINVAL;
1030		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1031			unsigned long t;
1032			if (get_user(t, nmask + k))
1033				return -EFAULT;
1034			if (k == nlongs - 1) {
1035				if (t & endmask)
1036					return -EINVAL;
1037			} else if (t)
1038				return -EINVAL;
1039		}
1040		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1041		endmask = ~0UL;
1042	}
1043
1044	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1045		return -EFAULT;
1046	nodes_addr(*nodes)[nlongs-1] &= endmask;
1047	return 0;
1048}
1049
1050/* Copy a kernel node mask to user space */
1051static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1052			      nodemask_t *nodes)
1053{
1054	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1055	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1056
1057	if (copy > nbytes) {
1058		if (copy > PAGE_SIZE)
1059			return -EINVAL;
1060		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1061			return -EFAULT;
1062		copy = nbytes;
1063	}
1064	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1065}
1066
1067asmlinkage long sys_mbind(unsigned long start, unsigned long len,
1068			unsigned long mode,
1069			unsigned long __user *nmask, unsigned long maxnode,
1070			unsigned flags)
1071{
1072	nodemask_t nodes;
1073	int err;
1074	unsigned short mode_flags;
1075
1076	mode_flags = mode & MPOL_MODE_FLAGS;
1077	mode &= ~MPOL_MODE_FLAGS;
1078	if (mode >= MPOL_MAX)
1079		return -EINVAL;
1080	if ((mode_flags & MPOL_F_STATIC_NODES) &&
1081	    (mode_flags & MPOL_F_RELATIVE_NODES))
1082		return -EINVAL;
1083	err = get_nodes(&nodes, nmask, maxnode);
1084	if (err)
1085		return err;
1086	return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1087}
1088
1089/* Set the process memory policy */
1090asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
1091		unsigned long maxnode)
1092{
1093	int err;
1094	nodemask_t nodes;
1095	unsigned short flags;
1096
1097	flags = mode & MPOL_MODE_FLAGS;
1098	mode &= ~MPOL_MODE_FLAGS;
1099	if ((unsigned int)mode >= MPOL_MAX)
1100		return -EINVAL;
1101	if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1102		return -EINVAL;
1103	err = get_nodes(&nodes, nmask, maxnode);
1104	if (err)
1105		return err;
1106	return do_set_mempolicy(mode, flags, &nodes);
1107}
1108
1109asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
1110		const unsigned long __user *old_nodes,
1111		const unsigned long __user *new_nodes)
1112{
1113	struct mm_struct *mm;
1114	struct task_struct *task;
1115	nodemask_t old;
1116	nodemask_t new;
1117	nodemask_t task_nodes;
1118	uid_t uid, euid;
1119	int err;
1120
1121	err = get_nodes(&old, old_nodes, maxnode);
1122	if (err)
1123		return err;
1124
1125	err = get_nodes(&new, new_nodes, maxnode);
1126	if (err)
1127		return err;
1128
1129	/* Find the mm_struct */
1130	read_lock(&tasklist_lock);
1131	task = pid ? find_task_by_vpid(pid) : current;
1132	if (!task) {
1133		read_unlock(&tasklist_lock);
1134		return -ESRCH;
1135	}
1136	mm = get_task_mm(task);
1137	read_unlock(&tasklist_lock);
1138
1139	if (!mm)
1140		return -EINVAL;
1141
1142	/*
1143	 * Check if this process has the right to modify the specified
1144	 * process. The right exists if the process has administrative
1145	 * capabilities, superuser privileges or the same
1146	 * userid as the target process.
1147	 */
1148	uid = current_uid();
1149	euid = current_euid();
1150	if (euid != task->suid && euid != task->uid &&
1151	    uid  != task->suid && uid  != task->uid &&
1152	    !capable(CAP_SYS_NICE)) {
1153		err = -EPERM;
1154		goto out;
1155	}
1156
1157	task_nodes = cpuset_mems_allowed(task);
1158	/* Is the user allowed to access the target nodes? */
1159	if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
1160		err = -EPERM;
1161		goto out;
1162	}
1163
1164	if (!nodes_subset(new, node_states[N_HIGH_MEMORY])) {
1165		err = -EINVAL;
1166		goto out;
1167	}
1168
1169	err = security_task_movememory(task);
1170	if (err)
1171		goto out;
1172
1173	err = do_migrate_pages(mm, &old, &new,
1174		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1175out:
1176	mmput(mm);
1177	return err;
1178}
1179
1180
1181/* Retrieve NUMA policy */
1182asmlinkage long sys_get_mempolicy(int __user *policy,
1183				unsigned long __user *nmask,
1184				unsigned long maxnode,
1185				unsigned long addr, unsigned long flags)
1186{
1187	int err;
1188	int uninitialized_var(pval);
1189	nodemask_t nodes;
1190
1191	if (nmask != NULL && maxnode < MAX_NUMNODES)
1192		return -EINVAL;
1193
1194	err = do_get_mempolicy(&pval, &nodes, addr, flags);
1195
1196	if (err)
1197		return err;
1198
1199	if (policy && put_user(pval, policy))
1200		return -EFAULT;
1201
1202	if (nmask)
1203		err = copy_nodes_to_user(nmask, maxnode, &nodes);
1204
1205	return err;
1206}
1207
1208#ifdef CONFIG_COMPAT
1209
1210asmlinkage long compat_sys_get_mempolicy(int __user *policy,
1211				     compat_ulong_t __user *nmask,
1212				     compat_ulong_t maxnode,
1213				     compat_ulong_t addr, compat_ulong_t flags)
1214{
1215	long err;
1216	unsigned long __user *nm = NULL;
1217	unsigned long nr_bits, alloc_size;
1218	DECLARE_BITMAP(bm, MAX_NUMNODES);
1219
1220	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1221	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1222
1223	if (nmask)
1224		nm = compat_alloc_user_space(alloc_size);
1225
1226	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1227
1228	if (!err && nmask) {
1229		err = copy_from_user(bm, nm, alloc_size);
1230		/* ensure entire bitmap is zeroed */
1231		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1232		err |= compat_put_bitmap(nmask, bm, nr_bits);
1233	}
1234
1235	return err;
1236}
1237
1238asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1239				     compat_ulong_t maxnode)
1240{
1241	long err = 0;
1242	unsigned long __user *nm = NULL;
1243	unsigned long nr_bits, alloc_size;
1244	DECLARE_BITMAP(bm, MAX_NUMNODES);
1245
1246	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1247	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1248
1249	if (nmask) {
1250		err = compat_get_bitmap(bm, nmask, nr_bits);
1251		nm = compat_alloc_user_space(alloc_size);
1252		err |= copy_to_user(nm, bm, alloc_size);
1253	}
1254
1255	if (err)
1256		return -EFAULT;
1257
1258	return sys_set_mempolicy(mode, nm, nr_bits+1);
1259}
1260
1261asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1262			     compat_ulong_t mode, compat_ulong_t __user *nmask,
1263			     compat_ulong_t maxnode, compat_ulong_t flags)
1264{
1265	long err = 0;
1266	unsigned long __user *nm = NULL;
1267	unsigned long nr_bits, alloc_size;
1268	nodemask_t bm;
1269
1270	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1271	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1272
1273	if (nmask) {
1274		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1275		nm = compat_alloc_user_space(alloc_size);
1276		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1277	}
1278
1279	if (err)
1280		return -EFAULT;
1281
1282	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1283}
1284
1285#endif
1286
1287/*
1288 * get_vma_policy(@task, @vma, @addr)
1289 * @task - task for fallback if vma policy == default
1290 * @vma   - virtual memory area whose policy is sought
1291 * @addr  - address in @vma for shared policy lookup
1292 *
1293 * Returns effective policy for a VMA at specified address.
1294 * Falls back to @task or system default policy, as necessary.
1295 * Current or other task's task mempolicy and non-shared vma policies
1296 * are protected by the task's mmap_sem, which must be held for read by
1297 * the caller.
1298 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1299 * count--added by the get_policy() vm_op, as appropriate--to protect against
1300 * freeing by another task.  It is the caller's responsibility to free the
1301 * extra reference for shared policies.
1302 */
1303static struct mempolicy *get_vma_policy(struct task_struct *task,
1304		struct vm_area_struct *vma, unsigned long addr)
1305{
1306	struct mempolicy *pol = task->mempolicy;
1307
1308	if (vma) {
1309		if (vma->vm_ops && vma->vm_ops->get_policy) {
1310			struct mempolicy *vpol = vma->vm_ops->get_policy(vma,
1311									addr);
1312			if (vpol)
1313				pol = vpol;
1314		} else if (vma->vm_policy)
1315			pol = vma->vm_policy;
1316	}
1317	if (!pol)
1318		pol = &default_policy;
1319	return pol;
1320}
1321
1322/*
1323 * Return a nodemask representing a mempolicy for filtering nodes for
1324 * page allocation
1325 */
1326static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1327{
1328	/* Lower zones don't get a nodemask applied for MPOL_BIND */
1329	if (unlikely(policy->mode == MPOL_BIND) &&
1330			gfp_zone(gfp) >= policy_zone &&
1331			cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1332		return &policy->v.nodes;
1333
1334	return NULL;
1335}
1336
1337/* Return a zonelist indicated by gfp for node representing a mempolicy */
1338static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy)
1339{
1340	int nd = numa_node_id();
1341
1342	switch (policy->mode) {
1343	case MPOL_PREFERRED:
1344		if (!(policy->flags & MPOL_F_LOCAL))
1345			nd = policy->v.preferred_node;
1346		break;
1347	case MPOL_BIND:
1348		/*
1349		 * Normally, MPOL_BIND allocations are node-local within the
1350		 * allowed nodemask.  However, if __GFP_THISNODE is set and the
1351		 * current node is part of the mask, we use the zonelist for
1352		 * the first node in the mask instead.
1353		 */
1354		if (unlikely(gfp & __GFP_THISNODE) &&
1355				unlikely(!node_isset(nd, policy->v.nodes)))
1356			nd = first_node(policy->v.nodes);
1357		break;
1358	case MPOL_INTERLEAVE: /* should not happen */
1359		break;
1360	default:
1361		BUG();
1362	}
1363	return node_zonelist(nd, gfp);
1364}
1365
1366/* Do dynamic interleaving for a process */
1367static unsigned interleave_nodes(struct mempolicy *policy)
1368{
1369	unsigned nid, next;
1370	struct task_struct *me = current;
1371
1372	nid = me->il_next;
1373	next = next_node(nid, policy->v.nodes);
1374	if (next >= MAX_NUMNODES)
1375		next = first_node(policy->v.nodes);
1376	if (next < MAX_NUMNODES)
1377		me->il_next = next;
1378	return nid;
1379}
1380
1381/*
1382 * Depending on the memory policy provide a node from which to allocate the
1383 * next slab entry.
1384 * @policy must be protected from freeing by the caller.  If @policy is
1385 * the current task's mempolicy, this protection is implicit, as only the
1386 * task can change its policy.  The system default policy requires no
1387 * such protection.
1388 */
1389unsigned slab_node(struct mempolicy *policy)
1390{
1391	if (!policy || policy->flags & MPOL_F_LOCAL)
1392		return numa_node_id();
1393
1394	switch (policy->mode) {
1395	case MPOL_PREFERRED:
1396		/*
1397		 * handled MPOL_F_LOCAL above
1398		 */
1399		return policy->v.preferred_node;
1400
1401	case MPOL_INTERLEAVE:
1402		return interleave_nodes(policy);
1403
1404	case MPOL_BIND: {
1405		/*
1406		 * Follow bind policy behavior and start allocation at the
1407		 * first node.
1408		 */
1409		struct zonelist *zonelist;
1410		struct zone *zone;
1411		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1412		zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
1413		(void)first_zones_zonelist(zonelist, highest_zoneidx,
1414							&policy->v.nodes,
1415							&zone);
1416		return zone->node;
1417	}
1418
1419	default:
1420		BUG();
1421	}
1422}
1423
1424/* Do static interleaving for a VMA with known offset. */
1425static unsigned offset_il_node(struct mempolicy *pol,
1426		struct vm_area_struct *vma, unsigned long off)
1427{
1428	unsigned nnodes = nodes_weight(pol->v.nodes);
1429	unsigned target;
1430	int c;
1431	int nid = -1;
1432
1433	if (!nnodes)
1434		return numa_node_id();
1435	target = (unsigned int)off % nnodes;
1436	c = 0;
1437	do {
1438		nid = next_node(nid, pol->v.nodes);
1439		c++;
1440	} while (c <= target);
1441	return nid;
1442}
1443
1444/* Determine a node number for interleave */
1445static inline unsigned interleave_nid(struct mempolicy *pol,
1446		 struct vm_area_struct *vma, unsigned long addr, int shift)
1447{
1448	if (vma) {
1449		unsigned long off;
1450
1451		/*
1452		 * for small pages, there is no difference between
1453		 * shift and PAGE_SHIFT, so the bit-shift is safe.
1454		 * for huge pages, since vm_pgoff is in units of small
1455		 * pages, we need to shift off the always 0 bits to get
1456		 * a useful offset.
1457		 */
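		/*
		 * Example (illustration only): with 2 MB huge pages on x86
		 * (shift == 21, PAGE_SHIFT == 12), a vma with vm_pgoff == 1024
		 * (4 MB into the object) yields off = 1024 >> 9 == 2, and each
		 * further 2 MB into the vma adds one, so successive huge pages
		 * get successive interleave offsets.
		 */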
1458		BUG_ON(shift < PAGE_SHIFT);
1459		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1460		off += (addr - vma->vm_start) >> shift;
1461		return offset_il_node(pol, vma, off);
1462	} else
1463		return interleave_nodes(pol);
1464}
1465
1466#ifdef CONFIG_HUGETLBFS
1467/*
1468 * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1469 * @vma = virtual memory area whose policy is sought
1470 * @addr = address in @vma for shared policy lookup and interleave policy
1471 * @gfp_flags = for requested zone
1472 * @mpol = pointer to mempolicy pointer for reference counted mempolicy
1473 * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
1474 *
1475 * Returns a zonelist suitable for a huge page allocation and a pointer
1476 * to the struct mempolicy for conditional unref after allocation.
1477 * If the effective policy is 'BIND', returns a pointer to the mempolicy's
1478 * @nodemask for filtering the zonelist.
1479 */
1480struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1481				gfp_t gfp_flags, struct mempolicy **mpol,
1482				nodemask_t **nodemask)
1483{
1484	struct zonelist *zl;
1485
1486	*mpol = get_vma_policy(current, vma, addr);
1487	*nodemask = NULL;	/* assume !MPOL_BIND */
1488
1489	if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1490		zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1491				huge_page_shift(hstate_vma(vma))), gfp_flags);
1492	} else {
1493		zl = policy_zonelist(gfp_flags, *mpol);
1494		if ((*mpol)->mode == MPOL_BIND)
1495			*nodemask = &(*mpol)->v.nodes;
1496	}
1497	return zl;
1498}
1499#endif
1500
1501/* Allocate a page under the interleave policy.
1502   Own path because it needs to do special accounting. */
1503static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1504					unsigned nid)
1505{
1506	struct zonelist *zl;
1507	struct page *page;
1508
1509	zl = node_zonelist(nid, gfp);
1510	page = __alloc_pages(gfp, order, zl);
1511	if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1512		inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1513	return page;
1514}
1515
1516/**
1517 * 	alloc_page_vma	- Allocate a page for a VMA.
1518 *
1519 * 	@gfp:
1520 *      %GFP_USER    user allocation.
1521 *      %GFP_KERNEL  kernel allocations,
1522 *      %GFP_HIGHMEM highmem/user allocations,
1523 *      %GFP_FS      allocation should not call back into a file system.
1524 *      %GFP_ATOMIC  don't sleep.
1525 *
1526 * 	@vma:  Pointer to VMA or NULL if not available.
1527 *	@addr: Virtual Address of the allocation. Must be inside the VMA.
1528 *
1529 * 	This function allocates a page from the kernel page pool and applies
1530 *	a NUMA policy associated with the VMA or the current process.
1531 *	When VMA is not NULL caller must hold down_read on the mmap_sem of the
1532 *	mm_struct of the VMA to prevent it from going away. Should be used for
1533 *	all allocations for pages that will be mapped into
1534 * 	user space. Returns NULL when no page can be allocated.
1535 *
1536 *	Should be called with the mm_sem of the vma hold.
1537 *	Should be called with the mmap_sem of the vma's mm held.
1538struct page *
1539alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1540{
1541	struct mempolicy *pol = get_vma_policy(current, vma, addr);
1542	struct zonelist *zl;
1543
1544	cpuset_update_task_memory_state();
1545
1546	if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
1547		unsigned nid;
1548
1549		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1550		mpol_cond_put(pol);
1551		return alloc_page_interleave(gfp, 0, nid);
1552	}
1553	zl = policy_zonelist(gfp, pol);
1554	if (unlikely(mpol_needs_cond_ref(pol))) {
1555		/*
1556		 * slow path: ref counted shared policy
1557		 */
1558		struct page *page =  __alloc_pages_nodemask(gfp, 0,
1559						zl, policy_nodemask(gfp, pol));
1560		__mpol_put(pol);
1561		return page;
1562	}
1563	/*
1564	 * fast path:  default or task policy
1565	 */
1566	return __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol));
1567}
1568
1569/**
1570 * 	alloc_pages_current - Allocate pages.
1571 *
1572 *	@gfp:
1573 *		%GFP_USER   user allocation,
1574 *      	%GFP_KERNEL kernel allocation,
1575 *      	%GFP_HIGHMEM highmem allocation,
1576 *      	%GFP_FS     don't call back into a file system.
1577 *      	%GFP_ATOMIC don't sleep.
1578 *	@order: Power of two of allocation size in pages. 0 is a single page.
1579 *
1580 *	Allocate a page from the kernel page pool.  When not in
1581 *	interrupt context, apply the current process' NUMA policy.
1582 *	Returns NULL when no page can be allocated.
1583 *
1584 *	Don't call cpuset_update_task_memory_state() unless
1585 *	1) it's ok to take cpuset_sem (can WAIT), and
1586 *	2) allocating for current task (not interrupt).
1587 */
1588struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1589{
1590	struct mempolicy *pol = current->mempolicy;
1591
1592	if ((gfp & __GFP_WAIT) && !in_interrupt())
1593		cpuset_update_task_memory_state();
1594	if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1595		pol = &default_policy;
1596
1597	/*
1598	 * No reference counting needed for current->mempolicy
1599	 * nor system default_policy
1600	 */
1601	if (pol->mode == MPOL_INTERLEAVE)
1602		return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1603	return __alloc_pages_nodemask(gfp, order,
1604			policy_zonelist(gfp, pol), policy_nodemask(gfp, pol));
1605}
1606EXPORT_SYMBOL(alloc_pages_current);
1607
1608/*
1609 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
1610 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
1611 * with the mems_allowed returned by cpuset_mems_allowed().  This
1612 * keeps mempolicies cpuset relative after its cpuset moves.  See
1613 * further kernel/cpuset.c update_nodemask().
1614 */
1615
1616/* Slow path of a mempolicy duplicate */
1617struct mempolicy *__mpol_dup(struct mempolicy *old)
1618{
1619	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1620
1621	if (!new)
1622		return ERR_PTR(-ENOMEM);
1623	if (current_cpuset_is_being_rebound()) {
1624		nodemask_t mems = cpuset_mems_allowed(current);
1625		mpol_rebind_policy(old, &mems);
1626	}
1627	*new = *old;
1628	atomic_set(&new->refcnt, 1);
1629	return new;
1630}
1631
1632/*
1633 * If *frompol needs [has] an extra ref, copy *frompol to *tompol,
1634 * eliminate the MPOL_F_* flags that require conditional ref and
1635 * [NOTE!!!] drop the extra ref.  Not safe to reference *frompol directly
1636 * after return.  Use the returned value.
1637 *
1638 * Allows use of a mempolicy for, e.g., multiple allocations with a single
1639 * policy lookup, even if the policy needs/has extra ref on lookup.
1640 * shmem_readahead needs this.
1641 */
1642struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol,
1643						struct mempolicy *frompol)
1644{
1645	if (!mpol_needs_cond_ref(frompol))
1646		return frompol;
1647
1648	*tompol = *frompol;
1649	tompol->flags &= ~MPOL_F_SHARED;	/* copy doesn't need unref */
1650	__mpol_put(frompol);
1651	return tompol;
1652}
1653
1654static int mpol_match_intent(const struct mempolicy *a,
1655			     const struct mempolicy *b)
1656{
1657	if (a->flags != b->flags)
1658		return 0;
1659	if (!mpol_store_user_nodemask(a))
1660		return 1;
1661	return nodes_equal(a->w.user_nodemask, b->w.user_nodemask);
1662}
1663
1664/* Slow path of a mempolicy comparison */
1665int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1666{
1667	if (!a || !b)
1668		return 0;
1669	if (a->mode != b->mode)
1670		return 0;
1671	if (a->mode != MPOL_DEFAULT && !mpol_match_intent(a, b))
1672		return 0;
1673	switch (a->mode) {
1674	case MPOL_BIND:
1675		/* Fall through */
1676	case MPOL_INTERLEAVE:
1677		return nodes_equal(a->v.nodes, b->v.nodes);
1678	case MPOL_PREFERRED:
1679		return a->v.preferred_node == b->v.preferred_node &&
1680			a->flags == b->flags;
1681	default:
1682		BUG();
1683		return 0;
1684	}
1685}
1686
1687/*
1688 * Shared memory backing store policy support.
1689 *
1690 * Remember policies even when nobody has shared memory mapped.
1691 * The policies are kept in a Red-Black tree linked from the inode.
1692 * They are protected by the sp->lock spinlock, which should be held
1693 * for any accesses to the tree.
1694 */
1695
1696/* lookup first element intersecting start-end */
1697/* Caller holds sp->lock */
1698static struct sp_node *
1699sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1700{
1701	struct rb_node *n = sp->root.rb_node;
1702
1703	while (n) {
1704		struct sp_node *p = rb_entry(n, struct sp_node, nd);
1705
1706		if (start >= p->end)
1707			n = n->rb_right;
1708		else if (end <= p->start)
1709			n = n->rb_left;
1710		else
1711			break;
1712	}
1713	if (!n)
1714		return NULL;
1715	for (;;) {
1716		struct sp_node *w = NULL;
1717		struct rb_node *prev = rb_prev(n);
1718		if (!prev)
1719			break;
1720		w = rb_entry(prev, struct sp_node, nd);
1721		if (w->end <= start)
1722			break;
1723		n = prev;
1724	}
1725	return rb_entry(n, struct sp_node, nd);
1726}
1727
1728/* Insert a new shared policy into the list. */
1729/* Caller holds sp->lock */
1730static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1731{
1732	struct rb_node **p = &sp->root.rb_node;
1733	struct rb_node *parent = NULL;
1734	struct sp_node *nd;
1735
1736	while (*p) {
1737		parent = *p;
1738		nd = rb_entry(parent, struct sp_node, nd);
1739		if (new->start < nd->start)
1740			p = &(*p)->rb_left;
1741		else if (new->end > nd->end)
1742			p = &(*p)->rb_right;
1743		else
1744			BUG();
1745	}
1746	rb_link_node(&new->nd, parent, p);
1747	rb_insert_color(&new->nd, &sp->root);
1748	pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
1749		 new->policy ? new->policy->mode : 0);
1750}
1751
1752/* Find shared policy intersecting idx */
1753struct mempolicy *
1754mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1755{
1756	struct mempolicy *pol = NULL;
1757	struct sp_node *sn;
1758
1759	if (!sp->root.rb_node)
1760		return NULL;
1761	spin_lock(&sp->lock);
1762	sn = sp_lookup(sp, idx, idx+1);
1763	if (sn) {
1764		mpol_get(sn->policy);
1765		pol = sn->policy;
1766	}
1767	spin_unlock(&sp->lock);
1768	return pol;
1769}
1770
1771static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1772{
1773	pr_debug("deleting %lx-%lx\n", n->start, n->end);
1774	rb_erase(&n->nd, &sp->root);
1775	mpol_put(n->policy);
1776	kmem_cache_free(sn_cache, n);
1777}
1778
1779static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
1780				struct mempolicy *pol)
1781{
1782	struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1783
1784	if (!n)
1785		return NULL;
1786	n->start = start;
1787	n->end = end;
1788	mpol_get(pol);
1789	pol->flags |= MPOL_F_SHARED;	/* for unref */
1790	n->policy = pol;
1791	return n;
1792}
1793
1794/* Replace a policy range. */
1795static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1796				 unsigned long end, struct sp_node *new)
1797{
1798	struct sp_node *n, *new2 = NULL;
1799
1800restart:
1801	spin_lock(&sp->lock);
1802	n = sp_lookup(sp, start, end);
1803	/* Take care of old policies in the same range. */
1804	while (n && n->start < end) {
1805		struct rb_node *next = rb_next(&n->nd);
1806		if (n->start >= start) {
1807			if (n->end <= end)
1808				sp_delete(sp, n);
1809			else
1810				n->start = end;
1811		} else {
1812			/* Old policy spanning whole new range. */
1813			if (n->end > end) {
1814				if (!new2) {
1815					spin_unlock(&sp->lock);
1816					new2 = sp_alloc(end, n->end, n->policy);
1817					if (!new2)
1818						return -ENOMEM;
1819					goto restart;
1820				}
1821				n->end = start;
1822				sp_insert(sp, new2);
1823				new2 = NULL;
1824				break;
1825			} else
1826				n->end = start;
1827		}
1828		if (!next)
1829			break;
1830		n = rb_entry(next, struct sp_node, nd);
1831	}
1832	if (new)
1833		sp_insert(sp, new);
1834	spin_unlock(&sp->lock);
1835	if (new2) {
1836		mpol_put(new2->policy);
1837		kmem_cache_free(sn_cache, new2);
1838	}
1839	return 0;
1840}
1841
1842/**
1843 * mpol_shared_policy_init - initialize shared policy for inode
1844 * @sp: pointer to inode shared policy
1845 * @mpol:  struct mempolicy to install
1846 *
1847 * Install non-NULL @mpol in inode's shared policy rb-tree.
1848 * On entry, the current task has a reference on a non-NULL @mpol.
1849 * This must be released on exit.
1850 */
1851void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
1852{
1853	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
1854	spin_lock_init(&sp->lock);
1855
1856	if (mpol) {
1857		struct vm_area_struct pvma;
1858		struct mempolicy *new;
1859
1860		/* contextualize the tmpfs mount point mempolicy */
1861		new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
1862		mpol_put(mpol);	/* drop our ref on sb mpol */
1863		if (IS_ERR(new))
1864			return;		/* no valid nodemask intersection */
1865
1866		/* Create pseudo-vma that contains just the policy */
1867		memset(&pvma, 0, sizeof(struct vm_area_struct));
1868		pvma.vm_end = TASK_SIZE;	/* policy covers entire file */
1869		mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
1870		mpol_put(new);			/* drop initial ref */
1871	}
1872}
1873
1874int mpol_set_shared_policy(struct shared_policy *info,
1875			struct vm_area_struct *vma, struct mempolicy *npol)
1876{
1877	int err;
1878	struct sp_node *new = NULL;
1879	unsigned long sz = vma_pages(vma);
1880
1881	pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
1882		 vma->vm_pgoff,
1883		 sz, npol ? npol->mode : -1,
1884		 npol ? npol->flags : -1,
1885		 npol ? nodes_addr(npol->v.nodes)[0] : -1);
1886
1887	if (npol) {
1888		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1889		if (!new)
1890			return -ENOMEM;
1891	}
1892	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1893	if (err && new)
1894		kmem_cache_free(sn_cache, new);
1895	return err;
1896}
1897
1898/* Free a backing policy store on inode delete. */
1899void mpol_free_shared_policy(struct shared_policy *p)
1900{
1901	struct sp_node *n;
1902	struct rb_node *next;
1903
1904	if (!p->root.rb_node)
1905		return;
1906	spin_lock(&p->lock);
1907	next = rb_first(&p->root);
1908	while (next) {
1909		n = rb_entry(next, struct sp_node, nd);
1910		next = rb_next(&n->nd);
1911		rb_erase(&n->nd, &p->root);
1912		mpol_put(n->policy);
1913		kmem_cache_free(sn_cache, n);
1914	}
1915	spin_unlock(&p->lock);
1916}
1917
1918/* assumes fs == KERNEL_DS */
1919void __init numa_policy_init(void)
1920{
1921	nodemask_t interleave_nodes;
1922	unsigned long largest = 0;
1923	int nid, prefer = 0;
1924
1925	policy_cache = kmem_cache_create("numa_policy",
1926					 sizeof(struct mempolicy),
1927					 0, SLAB_PANIC, NULL);
1928
1929	sn_cache = kmem_cache_create("shared_policy_node",
1930				     sizeof(struct sp_node),
1931				     0, SLAB_PANIC, NULL);
1932
1933	/*
1934	 * Set interleaving policy for system init. Interleaving is only
1935	 * enabled across suitably sized nodes (default is >= 16MB); if all
1936	 * nodes are smaller than that, fall back to the largest one.
1937	 */
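	/*
	 * With 4K pages (PAGE_SHIFT == 12), the 16MB cut-off below works
	 * out to node_present_pages(nid) >= 4096.
	 */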
1938	nodes_clear(interleave_nodes);
1939	for_each_node_state(nid, N_HIGH_MEMORY) {
1940		unsigned long total_pages = node_present_pages(nid);
1941
1942		/* Preserve the largest node */
1943		if (largest < total_pages) {
1944			largest = total_pages;
1945			prefer = nid;
1946		}
1947
1948		/* Interleave this node? */
1949		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
1950			node_set(nid, interleave_nodes);
1951	}
1952
1953	/* All too small, use the largest */
1954	if (unlikely(nodes_empty(interleave_nodes)))
1955		node_set(prefer, interleave_nodes);
1956
1957	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
1958		printk(KERN_ERR "numa_policy_init: interleaving failed\n");
1959}
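/*
 * The interleave policy installed above belongs to the boot task and is
 * inherited across fork, so early kernel threads spread their allocations
 * as well; numa_default_policy() below is the way back to plain local
 * allocation once boot-time interleaving is no longer wanted.
 */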
1960
1961/* Reset policy of current process to default */
1962void numa_default_policy(void)
1963{
1964	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
1965}
1966
1967/*
1968 * Parse and format mempolicy from/to strings
1969 */
1970
1971/*
1972 * "local" is a pseudo-policy: MPOL_PREFERRED with the MPOL_F_LOCAL flag
1973 * Used only for mpol_parse_str() and mpol_to_str()
1974 */
1975#define MPOL_LOCAL (MPOL_INTERLEAVE + 1)
1976static const char * const policy_types[] =
1977	{ "default", "prefer", "bind", "interleave", "local" };
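/*
 * policy_types[] is indexed by mode, so its order must match the MPOL_*
 * values (e.g. policy_types[MPOL_BIND] == "bind"); MPOL_LOCAL is the
 * extra, string-only pseudo-mode defined above.
 */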
1978
1979
1980#ifdef CONFIG_TMPFS
1981/**
1982 * mpol_parse_str - parse string to mempolicy
1983 * @str:  string containing mempolicy to parse
1984 * @mpol:  pointer to struct mempolicy pointer, returned on success.
1985 * @no_context:  flag whether to "contextualize" the mempolicy
1986 *
1987 * Format of input:
1988 *	<mode>[=<flags>][:<nodelist>]
1989 *
1990 * If @no_context is true, save the input nodemask in w.user_nodemask in
1991 * the returned mempolicy.  This will be used to "clone" the mempolicy in
1992 * a specific context [cpuset] at a later time.  Used to parse tmpfs mpol
1993 * mount option.  Note that if 'static' or 'relative' mode flags were
1994 * specified, the input nodemask will already have been saved.  Saving
1995 * it again is redundant, but safe.
1996 *
1997 * On success, returns 0, else 1
1998 */
1999int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2000{
2001	struct mempolicy *new = NULL;
2002	unsigned short uninitialized_var(mode);
2003	unsigned short uninitialized_var(mode_flags);
2004	nodemask_t nodes;
2005	char *nodelist = strchr(str, ':');
2006	char *flags = strchr(str, '=');
2007	int i;
2008	int err = 1;
2009
2010	if (nodelist) {
2011		/* NUL-terminate mode or flags string */
2012		*nodelist++ = '\0';
2013		if (nodelist_parse(nodelist, nodes))
2014			goto out;
2015		if (!nodes_subset(nodes, node_states[N_HIGH_MEMORY]))
2016			goto out;
2017	} else
2018		nodes_clear(nodes);
2019
2020	if (flags)
2021		*flags++ = '\0';	/* terminate mode string */
2022
2023	for (i = 0; i <= MPOL_LOCAL; i++) {
2024		if (!strcmp(str, policy_types[i])) {
2025			mode = i;
2026			break;
2027		}
2028	}
2029	if (i > MPOL_LOCAL)
2030		goto out;
2031
2032	switch (mode) {
2033	case MPOL_PREFERRED:
2034		/*
2035		 * Insist on a nodelist of one node only
2036		 */
2037		if (nodelist) {
2038			char *rest = nodelist;
2039			while (isdigit(*rest))
2040				rest++;
2041			if (!*rest)
2042				err = 0;
2043		}
2044		break;
2045	case MPOL_INTERLEAVE:
2046		/*
2047		 * Default to online nodes with memory if no nodelist
2048		 */
2049		if (!nodelist)
2050			nodes = node_states[N_HIGH_MEMORY];
2051		err = 0;
2052		break;
2053	case MPOL_LOCAL:
2054		/*
2055		 * Don't allow a nodelist;  mpol_new() checks flags
2056		 */
2057		if (nodelist)
2058			goto out;
2059		mode = MPOL_PREFERRED;
		err = 0;
2060		break;
2061
2062	/*
2063	 * case MPOL_BIND:    mpol_new() enforces non-empty nodemask.
2064	 * case MPOL_DEFAULT: mpol_new() enforces empty nodemask, ignores flags.
2065	 */
	default:
		err = 0;
2066	}
2067
2068	mode_flags = 0;
2069	if (flags) {
2070		/*
2071		 * Currently, we only support two mutually exclusive
2072		 * mode flags.
2073		 */
2074		if (!strcmp(flags, "static"))
2075			mode_flags |= MPOL_F_STATIC_NODES;
2076		else if (!strcmp(flags, "relative"))
2077			mode_flags |= MPOL_F_RELATIVE_NODES;
2078		else
2079			err = 1;
2080	}
2081
2082	new = mpol_new(mode, mode_flags, &nodes);
2083	if (IS_ERR(new))
2084		err = 1;
2085	else if (no_context && new)	/* new is NULL for "default" */
2086		new->w.user_nodemask = nodes;	/* save for contextualization */
2087
2088out:
2089	/* Restore string for error message */
2090	if (nodelist)
2091		*--nodelist = ':';
2092	if (flags)
2093		*--flags = '=';
2094	if (!err)
2095		*mpol = new;
	else if (new && !IS_ERR(new))
		mpol_put(new);	/* don't leak a policy we are not returning */
2096	return err;
2097}
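/*
 * Illustrative inputs (the node numbers are whatever the machine provides):
 *	"interleave:0-3"	-> MPOL_INTERLEAVE over nodes 0-3
 *	"interleave"		-> MPOL_INTERLEAVE over all nodes with memory
 *	"prefer:2"		-> MPOL_PREFERRED with preferred node 2
 *	"bind=static:1,3"	-> MPOL_BIND | MPOL_F_STATIC_NODES on nodes 1 and 3
 *	"local"			-> MPOL_PREFERRED with an empty nodemask,
 *				   i.e. allocate on the local node
 */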
2098#endif /* CONFIG_TMPFS */
2099
2100/**
2101 * mpol_to_str - format a mempolicy structure for printing
2102 * @buffer:  to contain formatted mempolicy string
2103 * @maxlen:  length of @buffer
2104 * @pol:  pointer to mempolicy to be formatted
2105 * @no_context:  "context free" mempolicy - use nodemask in w.user_nodemask
2106 *
2107 * Convert a mempolicy into a string.
2108 * Returns the number of characters in buffer (if positive)
2109 * or an error (negative)
2110 */
2111int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
2112{
2113	char *p = buffer;
2114	int l;
2115	nodemask_t nodes;
2116	unsigned short mode;
2117	unsigned short flags = pol ? pol->flags : 0;
2118
2119	/*
2120	 * Sanity check:  room for longest mode, flag and some nodes
2121	 */
2122	VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16);
2123
2124	if (!pol || pol == &default_policy)
2125		mode = MPOL_DEFAULT;
2126	else
2127		mode = pol->mode;
2128
2129	switch (mode) {
2130	case MPOL_DEFAULT:
2131		nodes_clear(nodes);
2132		break;
2133
2134	case MPOL_PREFERRED:
2135		nodes_clear(nodes);
2136		if (flags & MPOL_F_LOCAL)
2137			mode = MPOL_LOCAL;	/* pseudo-policy */
2138		else
2139			node_set(pol->v.preferred_node, nodes);
2140		break;
2141
2142	case MPOL_BIND:
2143		/* Fall through */
2144	case MPOL_INTERLEAVE:
2145		if (no_context)
2146			nodes = pol->w.user_nodemask;
2147		else
2148			nodes = pol->v.nodes;
2149		break;
2150
2151	default:
2152		BUG();
2153	}
2154
2155	l = strlen(policy_types[mode]);
2156	if (buffer + maxlen < p + l + 1)
2157		return -ENOSPC;
2158
2159	strcpy(p, policy_types[mode]);
2160	p += l;
2161
2162	if (flags & MPOL_MODE_FLAGS) {
2163		if (buffer + maxlen < p + 2)
2164			return -ENOSPC;
2165		*p++ = '=';
2166
2167		/*
2168		 * Currently, the only defined flags are mutually exclusive
2169		 */
2170		if (flags & MPOL_F_STATIC_NODES)
2171			p += snprintf(p, buffer + maxlen - p, "static");
2172		else if (flags & MPOL_F_RELATIVE_NODES)
2173			p += snprintf(p, buffer + maxlen - p, "relative");
2174	}
2175
2176	if (!nodes_empty(nodes)) {
2177		if (buffer + maxlen < p + 2)
2178			return -ENOSPC;
2179		*p++ = ':';
2180		p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
2181	}
2182	return p - buffer;
2183}
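/*
 * Example outputs (illustrative):
 *	default policy			-> "default"
 *	preferred, node 2		-> "prefer:2"
 *	preferred with MPOL_F_LOCAL	-> "local"
 *	bind, static, nodes 1 and 3	-> "bind=static:1,3"
 *	interleave over nodes 0-3	-> "interleave:0-3"
 */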
2184
2185struct numa_maps {
2186	unsigned long pages;
2187	unsigned long anon;
2188	unsigned long active;
2189	unsigned long writeback;
2190	unsigned long mapcount_max;
2191	unsigned long dirty;
2192	unsigned long swapcache;
2193	unsigned long node[MAX_NUMNODES];
2194};
2195
2196static void gather_stats(struct page *page, void *private, int pte_dirty)
2197{
2198	struct numa_maps *md = private;
2199	int count = page_mapcount(page);
2200
2201	md->pages++;
2202	if (pte_dirty || PageDirty(page))
2203		md->dirty++;
2204
2205	if (PageSwapCache(page))
2206		md->swapcache++;
2207
2208	if (PageActive(page) || PageUnevictable(page))
2209		md->active++;
2210
2211	if (PageWriteback(page))
2212		md->writeback++;
2213
2214	if (PageAnon(page))
2215		md->anon++;
2216
2217	if (count > md->mapcount_max)
2218		md->mapcount_max = count;
2219
2220	md->node[page_to_nid(page)]++;
2221}
2222
2223#ifdef CONFIG_HUGETLB_PAGE
2224static void check_huge_range(struct vm_area_struct *vma,
2225		unsigned long start, unsigned long end,
2226		struct numa_maps *md)
2227{
2228	unsigned long addr;
2229	struct page *page;
2230	struct hstate *h = hstate_vma(vma);
2231	unsigned long sz = huge_page_size(h);
2232
2233	for (addr = start; addr < end; addr += sz) {
2234		pte_t *ptep = huge_pte_offset(vma->vm_mm,
2235						addr & huge_page_mask(h));
2236		pte_t pte;
2237
2238		if (!ptep)
2239			continue;
2240
2241		pte = *ptep;
2242		if (pte_none(pte))
2243			continue;
2244
2245		page = pte_page(pte);
2246		if (!page)
2247			continue;
2248
2249		gather_stats(page, md, pte_dirty(*ptep));
2250	}
2251}
2252#else
2253static inline void check_huge_range(struct vm_area_struct *vma,
2254		unsigned long start, unsigned long end,
2255		struct numa_maps *md)
2256{
2257}
2258#endif
2259
2260/*
2261 * Display pages allocated per node and memory policy via /proc.
2262 */
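/*
 * A line of output might look like (values purely hypothetical):
 *
 *	2aaaaac00000 interleave=static:0-3 file=/dev/zero mapped=16 mapmax=2 N0=4 N1=4 N2=4 N3=4
 */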
2263int show_numa_map(struct seq_file *m, void *v)
2264{
2265	struct proc_maps_private *priv = m->private;
2266	struct vm_area_struct *vma = v;
2267	struct numa_maps *md;
2268	struct file *file = vma->vm_file;
2269	struct mm_struct *mm = vma->vm_mm;
2270	struct mempolicy *pol;
2271	int n;
2272	char buffer[50];
2273
2274	if (!mm)
2275		return 0;
2276
2277	md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
2278	if (!md)
2279		return 0;
2280
2281	pol = get_vma_policy(priv->task, vma, vma->vm_start);
2282	mpol_to_str(buffer, sizeof(buffer), pol, 0);
2283	mpol_cond_put(pol);
2284
2285	seq_printf(m, "%08lx %s", vma->vm_start, buffer);
2286
2287	if (file) {
2288		seq_printf(m, " file=");
2289		seq_path(m, &file->f_path, "\n\t= ");
2290	} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
2291		seq_printf(m, " heap");
2292	} else if (vma->vm_start <= mm->start_stack &&
2293			vma->vm_end >= mm->start_stack) {
2294		seq_printf(m, " stack");
2295	}
2296
2297	if (is_vm_hugetlb_page(vma)) {
2298		check_huge_range(vma, vma->vm_start, vma->vm_end, md);
2299		seq_printf(m, " huge");
2300	} else {
2301		check_pgd_range(vma, vma->vm_start, vma->vm_end,
2302			&node_states[N_HIGH_MEMORY], MPOL_MF_STATS, md);
2303	}
2304
2305	if (!md->pages)
2306		goto out;
2307
2308	if (md->anon)
2309		seq_printf(m, " anon=%lu", md->anon);
2310
2311	if (md->dirty)
2312		seq_printf(m, " dirty=%lu", md->dirty);
2313
2314	if (md->pages != md->anon && md->pages != md->dirty)
2315		seq_printf(m, " mapped=%lu", md->pages);
2316
2317	if (md->mapcount_max > 1)
2318		seq_printf(m, " mapmax=%lu", md->mapcount_max);
2319
2320	if (md->swapcache)
2321		seq_printf(m, " swapcache=%lu", md->swapcache);
2322
2323	if (md->active < md->pages && !is_vm_hugetlb_page(vma))
2324		seq_printf(m, " active=%lu", md->active);
2325
2326	if (md->writeback)
2327		seq_printf(m, " writeback=%lu", md->writeback);
2328
2329	for_each_node_state(n, N_HIGH_MEMORY)
2330		if (md->node[n])
2331			seq_printf(m, " N%d=%lu", n, md->node[n]);
2332out:
2333	seq_putc(m, '\n');
2334	kfree(md);
2335
2336	if (m->count < m->size)
2337		m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
2338	return 0;
2339}
2340