mempolicy.c revision d79df630f622806c4d0e116fbaf6ebf6baf53461
/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
 * Subject to the GNU Public License, version 2.
 *
 * NUMA policy allows the user to give hints about which node(s) memory
 * should be allocated on.
 *
 * Support four policies per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave     Allocate memory interleaved over a set of nodes,
 *                with normal fallback if it fails.
 *                For VMA-based allocations this interleaves based on the
 *                offset into the backing object or offset into the mapping
 *                for anonymous memory. For process policy a per-process
 *                counter is used.
 *
 * bind           Only allocate memory on a specific set of nodes,
 *                no fallback.
 *                FIXME: memory is allocated starting with the first node
 *                to the last. It would be better if bind would truly restrict
 *                the allocation to memory nodes instead.
 *
 * preferred      Try a specific node first before normal fallback.
 *                As a special case node -1 here means do the allocation
 *                on the local CPU. This is normally identical to default,
 *                but useful to set in a VMA when you have a non-default
 *                process policy.
 *
 * default        Allocate on the local node first, or when on a VMA
 *                use the process policy. This is what Linux always did
 *                in a NUMA-aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non-interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for memory
 * allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem kernel lowmem allocations don't get policied.
 * Same with GFP_DMA allocations.
 *
 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
 * all users and remembered even when nobody has memory mapped.
 */
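
/*
 * Illustrative userspace sketch (not part of the original file): the
 * policies above are installed via the set_mempolicy(2) and mbind(2)
 * syscalls, here shown with the libnuma <numaif.h> wrappers.  Note that
 * the kernel decrements maxnode (see get_nodes() below), so libnuma
 * passes the number of mask bits plus one:
 *
 *	unsigned long mask = 0x3;	// nodes 0 and 1
 *	// interleave future allocations of this process across nodes 0-1
 *	set_mempolicy(MPOL_INTERLEAVE, &mask, 8 * sizeof(mask) + 1);
 *	// restrict an existing mapping at addr to node 0 only
 *	unsigned long node0 = 0x1;
 *	mbind(addr, len, MPOL_BIND, &node0, 8 * sizeof(node0) + 1, 0);
 */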

/* Notebook:
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
   first item above.
   handle mremap for shared memory (currently ignored for the policy)
   grows down?
   make bind policy root only? It can trigger oom much faster and the
   kernel is not always graceful with that.
*/

#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/module.h>
#include <linux/nsproxy.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <linux/swap.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/migrate.h>
#include <linux/rmap.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/ctype.h>

#include <asm/tlbflush.h>
#include <asm/uaccess.h>

/* Internal flags */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for contiguous vmas */
#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)		/* Gather statistics */

static struct kmem_cache *policy_cache;
static struct kmem_cache *sn_cache;

/* Highest zone. A specific allocation for a zone below that is not
   policied. */
enum zone_type policy_zone = 0;

/*
 * run-time system-wide default policy => local allocation
 */
struct mempolicy default_policy = {
	.refcnt = ATOMIC_INIT(1), /* never free it */
	.mode = MPOL_PREFERRED,
	.flags = MPOL_F_LOCAL,
};

static const struct mempolicy_operations {
	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
} mpol_ops[MPOL_MAX];

/* Check that the nodemask contains at least one populated zone */
static int is_valid_nodemask(const nodemask_t *nodemask)
{
	int nd, k;

	/* Check that there is something useful in this mask: scan every
	   zone up to and including policy_zone on each node. */
	for_each_node_mask(nd, *nodemask) {
		struct zone *z;

		for (k = 0; k <= policy_zone; k++) {
			z = &NODE_DATA(nd)->node_zones[k];
			if (z->present_pages > 0)
				return 1;
		}
	}

	return 0;
}

static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
{
	return pol->flags & (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES);
}

static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
				   const nodemask_t *rel)
{
	nodemask_t tmp;
	nodes_fold(tmp, *orig, nodes_weight(*rel));
	nodes_onto(*ret, tmp, *rel);
}
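
/*
 * Worked example (sketch based on nodes_fold()/nodes_onto() semantics):
 * with a user-relative mask of {0,2} and cpuset mems_allowed {4,5,6},
 * nodes_fold() wraps the relative bits modulo 3 (still {0,2}) and
 * nodes_onto() maps bit n to the n-th allowed node, yielding {4,6}.
 */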

static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (nodes_empty(*nodes))
		return -EINVAL;
	pol->v.nodes = *nodes;
	return 0;
}

static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (!nodes)
		pol->flags |= MPOL_F_LOCAL;	/* local allocation */
	else if (nodes_empty(*nodes))
		return -EINVAL;			/* no allowed nodes */
	else
		pol->v.preferred_node = first_node(*nodes);
	return 0;
}

static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (!is_valid_nodemask(nodes))
		return -EINVAL;
	pol->v.nodes = *nodes;
	return 0;
}

/* Create a new policy */
static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
				  nodemask_t *nodes)
{
	struct mempolicy *policy;
	nodemask_t cpuset_context_nmask;
	int ret;

	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
		 mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);

	if (mode == MPOL_DEFAULT) {
		if (nodes && !nodes_empty(*nodes))
			return ERR_PTR(-EINVAL);
		return NULL;	/* simply delete any existing policy */
	}
	VM_BUG_ON(!nodes);

	/*
	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
	 * All other modes require a valid pointer to a non-empty nodemask.
	 */
	if (mode == MPOL_PREFERRED) {
		if (nodes_empty(*nodes)) {
			if (((flags & MPOL_F_STATIC_NODES) ||
			     (flags & MPOL_F_RELATIVE_NODES)))
				return ERR_PTR(-EINVAL);
			nodes = NULL;	/* flag local alloc */
		}
	} else if (nodes_empty(*nodes))
		return ERR_PTR(-EINVAL);
	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
	if (!policy)
		return ERR_PTR(-ENOMEM);
	atomic_set(&policy->refcnt, 1);
	policy->mode = mode;
	policy->flags = flags;

	if (nodes) {
		/*
		 * cpuset related setup doesn't apply to local allocation
		 */
		cpuset_update_task_memory_state();
		if (flags & MPOL_F_RELATIVE_NODES)
			mpol_relative_nodemask(&cpuset_context_nmask, nodes,
					       &cpuset_current_mems_allowed);
		else
			nodes_and(cpuset_context_nmask, *nodes,
				  cpuset_current_mems_allowed);
		if (mpol_store_user_nodemask(policy))
			policy->w.user_nodemask = *nodes;
		else
			policy->w.cpuset_mems_allowed =
						cpuset_mems_allowed(current);
	}

	ret = mpol_ops[mode].create(policy,
				nodes ? &cpuset_context_nmask : NULL);
	if (ret < 0) {
		kmem_cache_free(policy_cache, policy);
		return ERR_PTR(ret);
	}
	return policy;
}
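
/*
 * For instance (illustrative): mpol_new(MPOL_PREFERRED, 0, &empty_mask)
 * yields a policy with MPOL_F_LOCAL set (allocate on the local node),
 * while mpol_new(MPOL_DEFAULT, 0, NULL) returns NULL so that any existing
 * policy is deleted rather than replaced.
 */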

/* Slow path of a mpol destructor. */
void __mpol_put(struct mempolicy *p)
{
	if (!atomic_dec_and_test(&p->refcnt))
		return;
	kmem_cache_free(policy_cache, p);
}

static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
{
}

static void mpol_rebind_nodemask(struct mempolicy *pol,
				 const nodemask_t *nodes)
{
	nodemask_t tmp;

	if (pol->flags & MPOL_F_STATIC_NODES)
		nodes_and(tmp, pol->w.user_nodemask, *nodes);
	else if (pol->flags & MPOL_F_RELATIVE_NODES)
		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
	else {
		nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed,
			    *nodes);
		pol->w.cpuset_mems_allowed = *nodes;
	}

	pol->v.nodes = tmp;
	if (!node_isset(current->il_next, tmp)) {
		current->il_next = next_node(current->il_next, tmp);
		if (current->il_next >= MAX_NUMNODES)
			current->il_next = first_node(tmp);
		if (current->il_next >= MAX_NUMNODES)
			current->il_next = numa_node_id();
	}
}

static void mpol_rebind_preferred(struct mempolicy *pol,
				  const nodemask_t *nodes)
{
	nodemask_t tmp;

	if (pol->flags & MPOL_F_STATIC_NODES) {
		int node = first_node(pol->w.user_nodemask);

		if (node_isset(node, *nodes)) {
			pol->v.preferred_node = node;
			pol->flags &= ~MPOL_F_LOCAL;
		} else
			pol->flags |= MPOL_F_LOCAL;
	} else if (pol->flags & MPOL_F_RELATIVE_NODES) {
		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
		pol->v.preferred_node = first_node(tmp);
	} else if (!(pol->flags & MPOL_F_LOCAL)) {
		pol->v.preferred_node = node_remap(pol->v.preferred_node,
						   pol->w.cpuset_mems_allowed,
						   *nodes);
		pol->w.cpuset_mems_allowed = *nodes;
	}
}

/* Migrate a policy to a different set of nodes */
static void mpol_rebind_policy(struct mempolicy *pol,
			       const nodemask_t *newmask)
{
	if (!pol)
		return;
	if (!mpol_store_user_nodemask(pol) &&
	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
		return;
	mpol_ops[pol->mode].rebind(pol, newmask);
}

/*
 * Wrapper for mpol_rebind_policy() that just requires task
 * pointer, and updates task mempolicy.
 */

void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
{
	mpol_rebind_policy(tsk->mempolicy, new);
}

/*
 * Rebind each vma in mm to new nodemask.
 *
 * Call holding a reference to mm.  Takes mm->mmap_sem during call.
 */

void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
{
	struct vm_area_struct *vma;

	down_write(&mm->mmap_sem);
	for (vma = mm->mmap; vma; vma = vma->vm_next)
		mpol_rebind_policy(vma->vm_policy, new);
	up_write(&mm->mmap_sem);
}

static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
	[MPOL_DEFAULT] = {
		.rebind = mpol_rebind_default,
	},
	[MPOL_INTERLEAVE] = {
		.create = mpol_new_interleave,
		.rebind = mpol_rebind_nodemask,
	},
	[MPOL_PREFERRED] = {
		.create = mpol_new_preferred,
		.rebind = mpol_rebind_preferred,
	},
	[MPOL_BIND] = {
		.create = mpol_new_bind,
		.rebind = mpol_rebind_nodemask,
	},
};

static void gather_stats(struct page *, void *, int pte_dirty);
static void migrate_page_add(struct page *page, struct list_head *pagelist,
				unsigned long flags);

/* Scan through pages checking if pages follow certain conditions. */
static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
		unsigned long addr, unsigned long end,
		const nodemask_t *nodes, unsigned long flags,
		void *private)
{
	pte_t *orig_pte;
	pte_t *pte;
	spinlock_t *ptl;

	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	do {
		struct page *page;
		int nid;

		if (!pte_present(*pte))
			continue;
		page = vm_normal_page(vma, addr, *pte);
		if (!page)
			continue;
		/*
		 * The check for PageReserved here is important to avoid
		 * handling zero pages and other pages that may have been
		 * marked special by the system.
		 *
		 * If PageReserved were not checked here then e.g.
		 * the location of the zero page could have an influence
		 * on MPOL_MF_STRICT, zero pages would be counted for
		 * the per node stats, and there would be useless attempts
		 * to put zero pages on the migration list.
		 */
		if (PageReserved(page))
			continue;
		nid = page_to_nid(page);
		if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
			continue;

		if (flags & MPOL_MF_STATS)
			gather_stats(page, private, pte_dirty(*pte));
		else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
			migrate_page_add(page, private, flags);
		else
			break;
	} while (pte++, addr += PAGE_SIZE, addr != end);
	pte_unmap_unlock(orig_pte, ptl);
	return addr != end;
}

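/*
 * The helpers below walk the upper page-table levels (pmd, pud, pgd),
 * skipping holes and handing each present range down to check_pte_range()
 * above; an early break there (strict-mode mismatch) surfaces as -EIO.
 */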
static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
		unsigned long addr, unsigned long end,
		const nodemask_t *nodes, unsigned long flags,
		void *private)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none_or_clear_bad(pmd))
			continue;
		if (check_pte_range(vma, pmd, addr, next, nodes,
				    flags, private))
			return -EIO;
	} while (pmd++, addr = next, addr != end);
	return 0;
}

static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
		unsigned long addr, unsigned long end,
		const nodemask_t *nodes, unsigned long flags,
		void *private)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		if (check_pmd_range(vma, pud, addr, next, nodes,
				    flags, private))
			return -EIO;
	} while (pud++, addr = next, addr != end);
	return 0;
}

static inline int check_pgd_range(struct vm_area_struct *vma,
		unsigned long addr, unsigned long end,
		const nodemask_t *nodes, unsigned long flags,
		void *private)
{
	pgd_t *pgd;
	unsigned long next;

	pgd = pgd_offset(vma->vm_mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		if (check_pud_range(vma, pgd, addr, next, nodes,
				    flags, private))
			return -EIO;
	} while (pgd++, addr = next, addr != end);
	return 0;
}

/*
 * Check if all pages in a range are on a set of nodes.
 * If pagelist != NULL then isolate pages from the LRU and
 * put them on the pagelist.
 */
static struct vm_area_struct *
check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
		const nodemask_t *nodes, unsigned long flags, void *private)
{
	int err;
	struct vm_area_struct *first, *vma, *prev;

	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {

		err = migrate_prep();
		if (err)
			return ERR_PTR(err);
	}

	first = find_vma(mm, start);
	if (!first)
		return ERR_PTR(-EFAULT);
	prev = NULL;
	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
			if (!vma->vm_next && vma->vm_end < end)
				return ERR_PTR(-EFAULT);
			if (prev && prev->vm_end < vma->vm_start)
				return ERR_PTR(-EFAULT);
		}
		if (!is_vm_hugetlb_page(vma) &&
		    ((flags & MPOL_MF_STRICT) ||
		     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
				vma_migratable(vma)))) {
			unsigned long endvma = vma->vm_end;

			if (endvma > end)
				endvma = end;
			if (vma->vm_start > start)
				start = vma->vm_start;
			err = check_pgd_range(vma, start, endvma, nodes,
						flags, private);
			if (err) {
				first = ERR_PTR(err);
				break;
			}
		}
		prev = vma;
	}
	return first;
}

/* Apply policy to a single VMA */
static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
{
	int err = 0;
	struct mempolicy *old = vma->vm_policy;

	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
		 vma->vm_ops, vma->vm_file,
		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);

	if (vma->vm_ops && vma->vm_ops->set_policy)
		err = vma->vm_ops->set_policy(vma, new);
	if (!err) {
		mpol_get(new);
		vma->vm_policy = new;
		mpol_put(old);
	}
	return err;
}

/* Step 2: apply policy to a range and do splits. */
static int mbind_range(struct vm_area_struct *vma, unsigned long start,
		       unsigned long end, struct mempolicy *new)
{
	struct vm_area_struct *next;
	int err;

	err = 0;
	for (; vma && vma->vm_start < end; vma = next) {
		next = vma->vm_next;
		if (vma->vm_start < start)
			err = split_vma(vma->vm_mm, vma, start, 1);
		if (!err && vma->vm_end > end)
			err = split_vma(vma->vm_mm, vma, end, 0);
		if (!err)
			err = policy_vma(vma, new);
		if (err)
			break;
	}
	return err;
}

/*
 * Update task->flags PF_MEMPOLICY bit: set iff non-default
 * mempolicy.  Allows more rapid checking of this (combined perhaps
 * with other PF_* flag bits) on memory allocation hot code paths.
 *
 * If called from outside this file, the task 'p' should -only- be
 * a newly forked child not yet visible on the task list, because
 * manipulating the task flags of a visible task is not safe.
 *
 * The above limitation is why this routine has the funny name
 * mpol_fix_fork_child_flag().
 *
 * It is also safe to call this with a task pointer of current,
 * which the static wrapper mpol_set_task_struct_flag() does,
 * for use within this file.
 */

void mpol_fix_fork_child_flag(struct task_struct *p)
{
	if (p->mempolicy)
		p->flags |= PF_MEMPOLICY;
	else
		p->flags &= ~PF_MEMPOLICY;
}

static void mpol_set_task_struct_flag(void)
{
	mpol_fix_fork_child_flag(current);
}

/* Set the process memory policy */
static long do_set_mempolicy(unsigned short mode, unsigned short flags,
			     nodemask_t *nodes)
{
	struct mempolicy *new;
	struct mm_struct *mm = current->mm;

	new = mpol_new(mode, flags, nodes);
	if (IS_ERR(new))
		return PTR_ERR(new);

	/*
	 * prevent changing our mempolicy while show_numa_maps()
	 * is using it.
	 * Note:  do_set_mempolicy() can be called at init time
	 * with no 'mm'.
	 */
	if (mm)
		down_write(&mm->mmap_sem);
	mpol_put(current->mempolicy);
	current->mempolicy = new;
	mpol_set_task_struct_flag();
	if (new && new->mode == MPOL_INTERLEAVE &&
	    nodes_weight(new->v.nodes))
		current->il_next = first_node(new->v.nodes);
	if (mm)
		up_write(&mm->mmap_sem);

	return 0;
}

/*
 * Return nodemask for policy for get_mempolicy() query
 */
static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
{
	nodes_clear(*nodes);
	if (p == &default_policy)
		return;

	switch (p->mode) {
	case MPOL_BIND:
		/* Fall through */
	case MPOL_INTERLEAVE:
		*nodes = p->v.nodes;
		break;
	case MPOL_PREFERRED:
		if (!(p->flags & MPOL_F_LOCAL))
			node_set(p->v.preferred_node, *nodes);
		/* else return empty node mask for local allocation */
		break;
	default:
		BUG();
	}
}

static int lookup_node(struct mm_struct *mm, unsigned long addr)
{
	struct page *p;
	int err;

	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
	if (err >= 0) {
		err = page_to_nid(p);
		put_page(p);
	}
	return err;
}

/* Retrieve NUMA policy */
static long do_get_mempolicy(int *policy, nodemask_t *nmask,
			     unsigned long addr, unsigned long flags)
{
	int err;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma = NULL;
	struct mempolicy *pol = current->mempolicy;

	cpuset_update_task_memory_state();
	if (flags &
		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
		return -EINVAL;

	if (flags & MPOL_F_MEMS_ALLOWED) {
		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
			return -EINVAL;
		*policy = 0;	/* just so it's initialized */
		*nmask  = cpuset_current_mems_allowed;
		return 0;
	}

	if (flags & MPOL_F_ADDR) {
		/*
		 * Do NOT fall back to task policy if the
		 * vma/shared policy at addr is NULL.  We
		 * want to return MPOL_DEFAULT in this case.
		 */
		down_read(&mm->mmap_sem);
		vma = find_vma_intersection(mm, addr, addr+1);
		if (!vma) {
			up_read(&mm->mmap_sem);
			return -EFAULT;
		}
		if (vma->vm_ops && vma->vm_ops->get_policy)
			pol = vma->vm_ops->get_policy(vma, addr);
		else
			pol = vma->vm_policy;
	} else if (addr)
		return -EINVAL;

	if (!pol)
		pol = &default_policy;	/* indicates default behavior */

	if (flags & MPOL_F_NODE) {
		if (flags & MPOL_F_ADDR) {
			err = lookup_node(mm, addr);
			if (err < 0)
				goto out;
			*policy = err;
		} else if (pol == current->mempolicy &&
				pol->mode == MPOL_INTERLEAVE) {
			*policy = current->il_next;
		} else {
			err = -EINVAL;
			goto out;
		}
	} else {
		*policy = pol == &default_policy ? MPOL_DEFAULT :
						pol->mode;
		/*
		 * Internal mempolicy flags must be masked off before exposing
		 * the policy to userspace.
		 */
		*policy |= (pol->flags & MPOL_MODE_FLAGS);
	}

	if (vma) {
		up_read(&current->mm->mmap_sem);
		vma = NULL;
	}

	err = 0;
	if (nmask)
		get_policy_nodemask(pol, nmask);

 out:
	mpol_cond_put(pol);
	if (vma)
		up_read(&current->mm->mmap_sem);
	return err;
}

#ifdef CONFIG_MIGRATION
/*
 * page migration
 */
static void migrate_page_add(struct page *page, struct list_head *pagelist,
				unsigned long flags)
{
	/*
	 * Avoid migrating a page that is shared with others.
	 */
	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
		isolate_lru_page(page, pagelist);
}

static struct page *new_node_page(struct page *page, unsigned long node, int **x)
{
	return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0);
}

/*
 * Migrate pages from one node to a target node.
 * Returns error or the number of pages not migrated.
 */
static int migrate_to_node(struct mm_struct *mm, int source, int dest,
			   int flags)
{
	nodemask_t nmask;
	LIST_HEAD(pagelist);
	int err = 0;

	nodes_clear(nmask);
	node_set(source, nmask);

	check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
			flags | MPOL_MF_DISCONTIG_OK, &pagelist);

	if (!list_empty(&pagelist))
		err = migrate_pages(&pagelist, new_node_page, dest);

	return err;
}

/*
 * Move pages between the two nodesets so as to preserve the physical
 * layout as much as possible.
 *
 * Returns the number of pages that could not be moved.
 */
int do_migrate_pages(struct mm_struct *mm,
	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
{
	LIST_HEAD(pagelist);
	int busy = 0;
	int err = 0;
	nodemask_t tmp;

	down_read(&mm->mmap_sem);

	err = migrate_vmas(mm, from_nodes, to_nodes, flags);
	if (err)
		goto out;

/*
 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
 * bit in 'tmp', and return that <source, dest> pair for migration.
 * The pair of nodemasks 'to' and 'from' define the map.
 *
 * If no pair of bits is found that way, fallback to picking some
 * pair of 'source' and 'dest' bits that are not the same.  If the
 * 'source' and 'dest' bits are the same, this represents a node
 * that will be migrating to itself, so no pages need move.
 *
 * If no bits are left in 'tmp', or if all remaining bits left
 * in 'tmp' correspond to the same bit in 'to', return false
 * (nothing left to migrate).
 *
 * This lets us pick a pair of nodes to migrate between, such that
 * if possible the dest node is not already occupied by some other
 * source node, minimizing the risk of overloading the memory on a
 * node that would happen if we migrated incoming memory to a node
 * before migrating outgoing memory from that same node.
 *
 * A single scan of tmp is sufficient.  As we go, we remember the
 * most recent <s, d> pair that moved (s != d).  If we find a pair
 * that not only moved, but what's better, moved to an empty slot
 * (d is not set in tmp), then we break out then, with that pair.
 * Otherwise when we finish scanning tmp, we at least have the
 * most recent <s, d> pair that moved.  If we get all the way through
 * the scan of tmp without finding any node that moved, much less
 * moved to an empty node, then there is nothing left worth migrating.
 */

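	/*
	 * Worked example (illustrative): from_nodes = {0,1}, to_nodes = {1,2}
	 * remap as 0->1 and 1->2.  The scan prefers the pair 1->2 because
	 * dest 2 is not a remaining source, so node 1 is drained into node 2
	 * before node 0's pages are moved into node 1.
	 */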
	tmp = *from_nodes;
	while (!nodes_empty(tmp)) {
		int s, d;
		int source = -1;
		int dest = 0;

		for_each_node_mask(s, tmp) {
			d = node_remap(s, *from_nodes, *to_nodes);
			if (s == d)
				continue;

			source = s;	/* Node moved. Memorize */
			dest = d;

			/* dest not in remaining from nodes? */
			if (!node_isset(dest, tmp))
				break;
		}
		if (source == -1)
			break;

		node_clear(source, tmp);
		err = migrate_to_node(mm, source, dest, flags);
		if (err > 0)
			busy += err;
		if (err < 0)
			break;
	}
out:
	up_read(&mm->mmap_sem);
	if (err < 0)
		return err;
	return busy;
}

/*
 * Allocate a new page for page migration based on vma policy.
 * Start assuming that page is mapped by vma pointed to by @private.
 * Search forward from there, if not.  N.B., this assumes that the
 * list of pages handed to migrate_pages()--which is how we get here--
 * is in virtual address order.
 */
static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
{
	struct vm_area_struct *vma = (struct vm_area_struct *)private;
	unsigned long uninitialized_var(address);

	while (vma) {
		address = page_address_in_vma(page, vma);
		if (address != -EFAULT)
			break;
		vma = vma->vm_next;
	}

	/*
	 * if !vma, alloc_page_vma() will use task or system default policy
	 */
	return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
}
#else

static void migrate_page_add(struct page *page, struct list_head *pagelist,
				unsigned long flags)
{
}

int do_migrate_pages(struct mm_struct *mm,
	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
{
	return -ENOSYS;
}

static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
{
	return NULL;
}
#endif

static long do_mbind(unsigned long start, unsigned long len,
		     unsigned short mode, unsigned short mode_flags,
		     nodemask_t *nmask, unsigned long flags)
{
	struct vm_area_struct *vma;
	struct mm_struct *mm = current->mm;
	struct mempolicy *new;
	unsigned long end;
	int err;
	LIST_HEAD(pagelist);

	if (flags & ~(unsigned long)(MPOL_MF_STRICT |
				     MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
		return -EINVAL;
	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
		return -EPERM;

	if (start & ~PAGE_MASK)
		return -EINVAL;

	if (mode == MPOL_DEFAULT)
		flags &= ~MPOL_MF_STRICT;

	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
	end = start + len;

	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;

	new = mpol_new(mode, mode_flags, nmask);
	if (IS_ERR(new))
		return PTR_ERR(new);

	/*
	 * If we are using the default policy then operation
	 * on discontinuous address spaces is okay after all
	 */
	if (!new)
		flags |= MPOL_MF_DISCONTIG_OK;

	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
		 start, start + len, mode, mode_flags,
		 nmask ? nodes_addr(*nmask)[0] : -1);

	down_write(&mm->mmap_sem);
	vma = check_range(mm, start, end, nmask,
			  flags | MPOL_MF_INVERT, &pagelist);

	err = PTR_ERR(vma);
	if (!IS_ERR(vma)) {
		int nr_failed = 0;

		err = mbind_range(vma, start, end, new);

		if (!list_empty(&pagelist))
			nr_failed = migrate_pages(&pagelist, new_vma_page,
						(unsigned long)vma);

		if (!err && nr_failed && (flags & MPOL_MF_STRICT))
			err = -EIO;
	}

	up_write(&mm->mmap_sem);
	mpol_put(new);
	return err;
}

/*
 * User space interface with variable sized bitmaps for nodelists.
 */

/* Copy a node mask from user space. */
static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
		     unsigned long maxnode)
{
	unsigned long k;
	unsigned long nlongs;
	unsigned long endmask;

	--maxnode;
	nodes_clear(*nodes);
	if (maxnode == 0 || !nmask)
		return 0;
	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
		return -EINVAL;

	nlongs = BITS_TO_LONGS(maxnode);
	if ((maxnode % BITS_PER_LONG) == 0)
		endmask = ~0UL;
	else
		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;

	/* When the user specified more nodes than supported just check
	   if the unsupported part is all zero. */
	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
		if (nlongs > PAGE_SIZE/sizeof(long))
			return -EINVAL;
		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
			unsigned long t;
			if (get_user(t, nmask + k))
				return -EFAULT;
			if (k == nlongs - 1) {
				if (t & endmask)
					return -EINVAL;
			} else if (t)
				return -EINVAL;
		}
		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
		endmask = ~0UL;
	}

	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
		return -EFAULT;
	nodes_addr(*nodes)[nlongs-1] &= endmask;
	return 0;
}
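
/*
 * E.g. (illustrative, on a 64-bit kernel): a caller passing maxnode = 65
 * ends up with maxnode = 64 after the decrement above, so nlongs = 1 and
 * endmask = ~0UL; passing maxnode = 33 gives 32 usable bits and
 * endmask = (1UL << 32) - 1, masking off the undefined upper bits.
 */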

/* Copy a kernel node mask to user space */
static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
			      nodemask_t *nodes)
{
	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);

	if (copy > nbytes) {
		if (copy > PAGE_SIZE)
			return -EINVAL;
		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
			return -EFAULT;
		copy = nbytes;
	}
	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
}

asmlinkage long sys_mbind(unsigned long start, unsigned long len,
			unsigned long mode,
			unsigned long __user *nmask, unsigned long maxnode,
			unsigned flags)
{
	nodemask_t nodes;
	int err;
	unsigned short mode_flags;

	mode_flags = mode & MPOL_MODE_FLAGS;
	mode &= ~MPOL_MODE_FLAGS;
	if (mode >= MPOL_MAX)
		return -EINVAL;
	if ((mode_flags & MPOL_F_STATIC_NODES) &&
	    (mode_flags & MPOL_F_RELATIVE_NODES))
		return -EINVAL;
	err = get_nodes(&nodes, nmask, maxnode);
	if (err)
		return err;
	return do_mbind(start, len, mode, mode_flags, &nodes, flags);
}

/* Set the process memory policy */
asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
		unsigned long maxnode)
{
	int err;
	nodemask_t nodes;
	unsigned short flags;

	flags = mode & MPOL_MODE_FLAGS;
	mode &= ~MPOL_MODE_FLAGS;
	if ((unsigned int)mode >= MPOL_MAX)
		return -EINVAL;
	if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
		return -EINVAL;
	err = get_nodes(&nodes, nmask, maxnode);
	if (err)
		return err;
	return do_set_mempolicy(mode, flags, &nodes);
}

asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
		const unsigned long __user *old_nodes,
		const unsigned long __user *new_nodes)
{
	struct mm_struct *mm;
	struct task_struct *task;
	nodemask_t old;
	nodemask_t new;
	nodemask_t task_nodes;
	int err;

	err = get_nodes(&old, old_nodes, maxnode);
	if (err)
		return err;

	err = get_nodes(&new, new_nodes, maxnode);
	if (err)
		return err;

	/* Find the mm_struct */
	read_lock(&tasklist_lock);
	task = pid ? find_task_by_vpid(pid) : current;
	if (!task) {
		read_unlock(&tasklist_lock);
		return -ESRCH;
	}
	mm = get_task_mm(task);
	read_unlock(&tasklist_lock);

	if (!mm)
		return -EINVAL;

	/*
	 * Check if this process has the right to modify the specified
	 * process. The right exists if the process has administrative
	 * capabilities, superuser privileges or the same
	 * userid as the target process.
	 */
	if ((current->euid != task->suid) && (current->euid != task->uid) &&
	    (current->uid != task->suid) && (current->uid != task->uid) &&
	    !capable(CAP_SYS_NICE)) {
		err = -EPERM;
		goto out;
	}

	task_nodes = cpuset_mems_allowed(task);
	/* Is the user allowed to access the target nodes? */
	if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
		err = -EPERM;
		goto out;
	}

	if (!nodes_subset(new, node_states[N_HIGH_MEMORY])) {
		err = -EINVAL;
		goto out;
	}

	err = security_task_movememory(task);
	if (err)
		goto out;

	err = do_migrate_pages(mm, &old, &new,
		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
out:
	mmput(mm);
	return err;
}

/* Retrieve NUMA policy */
asmlinkage long sys_get_mempolicy(int __user *policy,
				unsigned long __user *nmask,
				unsigned long maxnode,
				unsigned long addr, unsigned long flags)
{
	int err;
	int uninitialized_var(pval);
	nodemask_t nodes;

	if (nmask != NULL && maxnode < MAX_NUMNODES)
		return -EINVAL;

	err = do_get_mempolicy(&pval, &nodes, addr, flags);

	if (err)
		return err;

	if (policy && put_user(pval, policy))
		return -EFAULT;

	if (nmask)
		err = copy_nodes_to_user(nmask, maxnode, &nodes);

	return err;
}

#ifdef CONFIG_COMPAT

asmlinkage long compat_sys_get_mempolicy(int __user *policy,
				     compat_ulong_t __user *nmask,
				     compat_ulong_t maxnode,
				     compat_ulong_t addr, compat_ulong_t flags)
{
	long err;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	DECLARE_BITMAP(bm, MAX_NUMNODES);

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask)
		nm = compat_alloc_user_space(alloc_size);

	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);

	if (!err && nmask) {
		err = copy_from_user(bm, nm, alloc_size);
		/* ensure entire bitmap is zeroed */
		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
		err |= compat_put_bitmap(nmask, bm, nr_bits);
	}

	return err;
}

asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
				     compat_ulong_t maxnode)
{
	long err = 0;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	DECLARE_BITMAP(bm, MAX_NUMNODES);

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask) {
		err = compat_get_bitmap(bm, nmask, nr_bits);
		nm = compat_alloc_user_space(alloc_size);
		err |= copy_to_user(nm, bm, alloc_size);
	}

	if (err)
		return -EFAULT;

	return sys_set_mempolicy(mode, nm, nr_bits+1);
}

asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
			     compat_ulong_t mode, compat_ulong_t __user *nmask,
			     compat_ulong_t maxnode, compat_ulong_t flags)
{
	long err = 0;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	nodemask_t bm;

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask) {
		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
		nm = compat_alloc_user_space(alloc_size);
		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
	}

	if (err)
		return -EFAULT;

	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
}

#endif

/*
 * get_vma_policy(@task, @vma, @addr)
 * @task  - task for fallback if vma policy == default
 * @vma   - virtual memory area whose policy is sought
 * @addr  - address in @vma for shared policy lookup
 *
 * Returns effective policy for a VMA at specified address.
 * Falls back to @task or system default policy, as necessary.
 * Current or other task's task mempolicy and non-shared vma policies
 * are protected by the task's mmap_sem, which must be held for read by
 * the caller.
 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
 * count--added by the get_policy() vm_op, as appropriate--to protect against
 * freeing by another task.  It is the caller's responsibility to free the
 * extra reference for shared policies.
 */
static struct mempolicy *get_vma_policy(struct task_struct *task,
		struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = task->mempolicy;

	if (vma) {
		if (vma->vm_ops && vma->vm_ops->get_policy) {
			struct mempolicy *vpol = vma->vm_ops->get_policy(vma,
									addr);
			if (vpol)
				pol = vpol;
		} else if (vma->vm_policy)
			pol = vma->vm_policy;
	}
	if (!pol)
		pol = &default_policy;
	return pol;
}

/*
 * Return a nodemask representing a mempolicy for filtering nodes for
 * page allocation
 */
static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
{
	/* Lower zones don't get a nodemask applied for MPOL_BIND */
	if (unlikely(policy->mode == MPOL_BIND) &&
			gfp_zone(gfp) >= policy_zone &&
			cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
		return &policy->v.nodes;

	return NULL;
}

/* Return a zonelist indicated by gfp for node representing a mempolicy */
static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy)
{
	int nd = numa_node_id();

	switch (policy->mode) {
	case MPOL_PREFERRED:
		if (!(policy->flags & MPOL_F_LOCAL))
			nd = policy->v.preferred_node;
		break;
	case MPOL_BIND:
		/*
		 * Normally, MPOL_BIND allocations are node-local within the
		 * allowed nodemask.  However, if __GFP_THISNODE is set and the
		 * current node is not part of the mask, we use the zonelist
		 * for the first node in the mask instead.
		 */
		if (unlikely(gfp & __GFP_THISNODE) &&
				unlikely(!node_isset(nd, policy->v.nodes)))
			nd = first_node(policy->v.nodes);
		break;
	case MPOL_INTERLEAVE: /* should not happen */
		break;
	default:
		BUG();
	}
	return node_zonelist(nd, gfp);
}

/* Do dynamic interleaving for a process */
static unsigned interleave_nodes(struct mempolicy *policy)
{
	unsigned nid, next;
	struct task_struct *me = current;

	nid = me->il_next;
	next = next_node(nid, policy->v.nodes);
	if (next >= MAX_NUMNODES)
		next = first_node(policy->v.nodes);
	if (next < MAX_NUMNODES)
		me->il_next = next;
	return nid;
}

/*
 * Depending on the memory policy provide a node from which to allocate the
 * next slab entry.
 * @policy must be protected from freeing by the caller.  If @policy is
 * the current task's mempolicy, this protection is implicit, as only the
 * task can change its policy.  The system default policy requires no
 * such protection.
 */
unsigned slab_node(struct mempolicy *policy)
{
	if (!policy || policy->flags & MPOL_F_LOCAL)
		return numa_node_id();

	switch (policy->mode) {
	case MPOL_PREFERRED:
		/*
		 * handled MPOL_F_LOCAL above
		 */
		return policy->v.preferred_node;

	case MPOL_INTERLEAVE:
		return interleave_nodes(policy);

	case MPOL_BIND: {
		/*
		 * Follow bind policy behavior and start allocation at the
		 * first node.
		 */
		struct zonelist *zonelist;
		struct zone *zone;
		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
		zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
		(void)first_zones_zonelist(zonelist, highest_zoneidx,
							&policy->v.nodes,
							&zone);
		return zone->node;
	}

	default:
		BUG();
	}
}

/* Do static interleaving for a VMA with known offset. */
static unsigned offset_il_node(struct mempolicy *pol,
		struct vm_area_struct *vma, unsigned long off)
{
	unsigned nnodes = nodes_weight(pol->v.nodes);
	unsigned target;
	int c;
	int nid = -1;

	if (!nnodes)
		return numa_node_id();
	target = (unsigned int)off % nnodes;
	c = 0;
	do {
		nid = next_node(nid, pol->v.nodes);
		c++;
	} while (c <= target);
	return nid;
}
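
/*
 * E.g. (illustrative): with pol->v.nodes = {1,3,5} and off = 7,
 * nnodes = 3 and target = 7 % 3 = 1, so the loop stops at the
 * second set node and returns nid = 3.
 */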

/* Determine a node number for interleave */
static inline unsigned interleave_nid(struct mempolicy *pol,
		 struct vm_area_struct *vma, unsigned long addr, int shift)
{
	if (vma) {
		unsigned long off;

		/*
		 * for small pages, there is no difference between
		 * shift and PAGE_SHIFT, so the bit-shift is safe.
		 * for huge pages, since vm_pgoff is in units of small
		 * pages, we need to shift off the always 0 bits to get
		 * a useful offset.
		 */
		BUG_ON(shift < PAGE_SHIFT);
		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
		off += (addr - vma->vm_start) >> shift;
		return offset_il_node(pol, vma, off);
	} else
		return interleave_nodes(pol);
}
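
/*
 * Sketch (assuming 4K base pages and 2MB huge pages, i.e. PAGE_SHIFT = 12
 * and shift = HPAGE_SHIFT = 21): off becomes vm_pgoff >> 9 plus
 * (addr - vm_start) >> 21, the index of the huge page within the mapping,
 * so interleaving advances once per huge page.
 */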

#ifdef CONFIG_HUGETLBFS
/*
 * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
 * @vma = virtual memory area whose policy is sought
 * @addr = address in @vma for shared policy lookup and interleave policy
 * @gfp_flags = for requested zone
 * @mpol = pointer to mempolicy pointer for reference counted mempolicy
 * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
 *
 * Returns a zonelist suitable for a huge page allocation and a pointer
 * to the struct mempolicy for conditional unref after allocation.
 * If the effective policy is MPOL_BIND, returns a pointer to the
 * mempolicy's @nodemask for filtering the zonelist.
 */
struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
				gfp_t gfp_flags, struct mempolicy **mpol,
				nodemask_t **nodemask)
{
	struct zonelist *zl;

	*mpol = get_vma_policy(current, vma, addr);
	*nodemask = NULL;	/* assume !MPOL_BIND */

	if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
		zl = node_zonelist(interleave_nid(*mpol, vma, addr,
						HPAGE_SHIFT), gfp_flags);
	} else {
		zl = policy_zonelist(gfp_flags, *mpol);
		if ((*mpol)->mode == MPOL_BIND)
			*nodemask = &(*mpol)->v.nodes;
	}
	return zl;
}
#endif

/* Allocate a page in interleaved policy.
   Own path because it needs to do special accounting. */
static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
					unsigned nid)
{
	struct zonelist *zl;
	struct page *page;

	zl = node_zonelist(nid, gfp);
	page = __alloc_pages(gfp, order, zl);
	if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
		inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
	return page;
}

/**
 * alloc_page_vma - Allocate a page for a VMA.
 *
 *	@gfp:
 *	%GFP_USER    user allocation.
 *	%GFP_KERNEL  kernel allocations,
 *	%GFP_HIGHMEM highmem/user allocations,
 *	%GFP_FS      allocation should not call back into a file system.
 *	%GFP_ATOMIC  don't sleep.
 *
 *	@vma:  Pointer to VMA or NULL if not available.
 *	@addr: Virtual Address of the allocation. Must be inside the VMA.
 *
 *	This function allocates a page from the kernel page pool and applies
 *	a NUMA policy associated with the VMA or the current process.
 *	When VMA is not NULL the caller must hold down_read on the mmap_sem
 *	of the mm_struct of the VMA to prevent it from going away.  Should be
 *	used for all allocations for pages that will be mapped into user
 *	space.  Returns NULL when no page can be allocated.
 */
struct page *
alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = get_vma_policy(current, vma, addr);
	struct zonelist *zl;

	cpuset_update_task_memory_state();

	if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
		unsigned nid;

		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
		mpol_cond_put(pol);
		return alloc_page_interleave(gfp, 0, nid);
	}
	zl = policy_zonelist(gfp, pol);
	if (unlikely(mpol_needs_cond_ref(pol))) {
		/*
		 * slow path: ref counted shared policy
		 */
		struct page *page = __alloc_pages_nodemask(gfp, 0,
						zl, policy_nodemask(gfp, pol));
		__mpol_put(pol);
		return page;
	}
	/*
	 * fast path:  default or task policy
	 */
	return __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol));
}

/**
 * alloc_pages_current - Allocate pages.
 *
 *	@gfp:
 *	%GFP_USER   user allocation,
 *	%GFP_KERNEL kernel allocation,
 *	%GFP_HIGHMEM highmem allocation,
 *	%GFP_FS     don't call back into a file system.
 *	%GFP_ATOMIC don't sleep.
 *	@order: Power of two of allocation size in pages. 0 is a single page.
 *
 *	Allocate a page from the kernel page pool, applying the current
 *	process' NUMA policy when not in interrupt context.
 *	Returns NULL when no page can be allocated.
 *
 *	Don't call cpuset_update_task_memory_state() unless
 *	1) it's ok to take cpuset_sem (can WAIT), and
 *	2) allocating for current task (not interrupt).
 */
struct page *alloc_pages_current(gfp_t gfp, unsigned order)
{
	struct mempolicy *pol = current->mempolicy;

	if ((gfp & __GFP_WAIT) && !in_interrupt())
		cpuset_update_task_memory_state();
	if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
		pol = &default_policy;

	/*
	 * No reference counting needed for current->mempolicy
	 * nor system default_policy
	 */
	if (pol->mode == MPOL_INTERLEAVE)
		return alloc_page_interleave(gfp, order, interleave_nodes(pol));
	return __alloc_pages_nodemask(gfp, order,
			policy_zonelist(gfp, pol), policy_nodemask(gfp, pol));
}
EXPORT_SYMBOL(alloc_pages_current);
1600
1601/*
1602 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
1603 * rebinds the mempolicy its copying by calling mpol_rebind_policy()
1604 * with the mems_allowed returned by cpuset_mems_allowed().  This
1605 * keeps mempolicies cpuset relative after its cpuset moves.  See
1606 * further kernel/cpuset.c update_nodemask().
1607 */
1608
1609/* Slow path of a mempolicy duplicate */
1610struct mempolicy *__mpol_dup(struct mempolicy *old)
1611{
1612	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1613
1614	if (!new)
1615		return ERR_PTR(-ENOMEM);
1616	if (current_cpuset_is_being_rebound()) {
1617		nodemask_t mems = cpuset_mems_allowed(current);
1618		mpol_rebind_policy(old, &mems);
1619	}
1620	*new = *old;
1621	atomic_set(&new->refcnt, 1);
1622	return new;
1623}
1624
1625/*
1626 * If *frompol needs [has] an extra ref, copy *frompol to *tompol ,
1627 * eliminate the * MPOL_F_* flags that require conditional ref and
1628 * [NOTE!!!] drop the extra ref.  Not safe to reference *frompol directly
1629 * after return.  Use the returned value.
1630 *
1631 * Allows use of a mempolicy for, e.g., multiple allocations with a single
1632 * policy lookup, even if the policy needs/has extra ref on lookup.
1633 * shmem_readahead needs this.
1634 */
1635struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol,
1636						struct mempolicy *frompol)
1637{
1638	if (!mpol_needs_cond_ref(frompol))
1639		return frompol;
1640
1641	*tompol = *frompol;
1642	tompol->flags &= ~MPOL_F_SHARED;	/* copy doesn't need unref */
1643	__mpol_put(frompol);
1644	return tompol;
1645}
1646
1647static int mpol_match_intent(const struct mempolicy *a,
1648			     const struct mempolicy *b)
1649{
1650	if (a->flags != b->flags)
1651		return 0;
1652	if (!mpol_store_user_nodemask(a))
1653		return 1;
1654	return nodes_equal(a->w.user_nodemask, b->w.user_nodemask);
1655}
1656
1657/* Slow path of a mempolicy comparison */
1658int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1659{
1660	if (!a || !b)
1661		return 0;
1662	if (a->mode != b->mode)
1663		return 0;
1664	if (a->mode != MPOL_DEFAULT && !mpol_match_intent(a, b))
1665		return 0;
1666	switch (a->mode) {
1667	case MPOL_BIND:
1668		/* Fall through */
1669	case MPOL_INTERLEAVE:
1670		return nodes_equal(a->v.nodes, b->v.nodes);
1671	case MPOL_PREFERRED:
1672		return a->v.preferred_node == b->v.preferred_node &&
1673			a->flags == b->flags;
1674	default:
1675		BUG();
1676		return 0;
1677	}
1678}
1679
1680/*
1681 * Shared memory backing store policy support.
1682 *
1683 * Remember policies even when nobody has shared memory mapped.
1684 * The policies are kept in Red-Black tree linked from the inode.
1685 * They are protected by the sp->lock spinlock, which should be held
1686 * for any accesses to the tree.
1687 */
1688
1689/* lookup first element intersecting start-end */
1690/* Caller holds sp->lock */
1691static struct sp_node *
1692sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1693{
1694	struct rb_node *n = sp->root.rb_node;
1695
1696	while (n) {
1697		struct sp_node *p = rb_entry(n, struct sp_node, nd);
1698
1699		if (start >= p->end)
1700			n = n->rb_right;
1701		else if (end <= p->start)
1702			n = n->rb_left;
1703		else
1704			break;
1705	}
1706	if (!n)
1707		return NULL;
1708	for (;;) {
1709		struct sp_node *w = NULL;
1710		struct rb_node *prev = rb_prev(n);
1711		if (!prev)
1712			break;
1713		w = rb_entry(prev, struct sp_node, nd);
1714		if (w->end <= start)
1715			break;
1716		n = prev;
1717	}
1718	return rb_entry(n, struct sp_node, nd);
1719}
1720
1721/* Insert a new shared policy into the list. */
1722/* Caller holds sp->lock */
1723static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1724{
1725	struct rb_node **p = &sp->root.rb_node;
1726	struct rb_node *parent = NULL;
1727	struct sp_node *nd;
1728
1729	while (*p) {
1730		parent = *p;
1731		nd = rb_entry(parent, struct sp_node, nd);
1732		if (new->start < nd->start)
1733			p = &(*p)->rb_left;
1734		else if (new->end > nd->end)
1735			p = &(*p)->rb_right;
1736		else
1737			BUG();
1738	}
1739	rb_link_node(&new->nd, parent, p);
1740	rb_insert_color(&new->nd, &sp->root);
1741	pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
1742		 new->policy ? new->policy->mode : 0);
1743}
1744
1745/* Find shared policy intersecting idx */
1746struct mempolicy *
1747mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1748{
1749	struct mempolicy *pol = NULL;
1750	struct sp_node *sn;
1751
1752	if (!sp->root.rb_node)
1753		return NULL;
1754	spin_lock(&sp->lock);
1755	sn = sp_lookup(sp, idx, idx+1);
1756	if (sn) {
1757		mpol_get(sn->policy);
1758		pol = sn->policy;
1759	}
1760	spin_unlock(&sp->lock);
1761	return pol;
1762}
1763
1764static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1765{
1766	pr_debug("deleting %lx-l%lx\n", n->start, n->end);
1767	rb_erase(&n->nd, &sp->root);
1768	mpol_put(n->policy);
1769	kmem_cache_free(sn_cache, n);
1770}
1771
1772static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
1773				struct mempolicy *pol)
1774{
1775	struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1776
1777	if (!n)
1778		return NULL;
1779	n->start = start;
1780	n->end = end;
1781	mpol_get(pol);
1782	pol->flags |= MPOL_F_SHARED;	/* for unref */
1783	n->policy = pol;
1784	return n;
1785}
1786
1787/* Replace a policy range. */
1788static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1789				 unsigned long end, struct sp_node *new)
1790{
1791	struct sp_node *n, *new2 = NULL;
1792
1793restart:
1794	spin_lock(&sp->lock);
1795	n = sp_lookup(sp, start, end);
1796	/* Take care of old policies in the same range. */
1797	while (n && n->start < end) {
1798		struct rb_node *next = rb_next(&n->nd);
1799		if (n->start >= start) {
1800			if (n->end <= end)
1801				sp_delete(sp, n);
1802			else
1803				n->start = end;
1804		} else {
1805			/* Old policy spanning whole new range. */
1806			if (n->end > end) {
1807				if (!new2) {
1808					spin_unlock(&sp->lock);
1809					new2 = sp_alloc(end, n->end, n->policy);
1810					if (!new2)
1811						return -ENOMEM;
1812					goto restart;
1813				}
1814				n->end = start;
1815				sp_insert(sp, new2);
1816				new2 = NULL;
1817				break;
1818			} else
1819				n->end = start;
1820		}
1821		if (!next)
1822			break;
1823		n = rb_entry(next, struct sp_node, nd);
1824	}
1825	if (new)
1826		sp_insert(sp, new);
1827	spin_unlock(&sp->lock);
1828	if (new2) {
1829		mpol_put(new2->policy);
1830		kmem_cache_free(sn_cache, new2);
1831	}
1832	return 0;
1833}
1834
1835/**
1836 * mpol_shared_policy_init - initialize shared policy for inode
1837 * @sp: pointer to inode shared policy
1838 * @mpol:  struct mempolicy to install
1839 *
1840 * Install non-NULL @mpol in inode's shared policy rb-tree.
1841 * On entry, the current task has a reference on a non-NULL @mpol.
1842 * This must be released on exit.
1843 */
1844void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
1845{
1846	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
1847	spin_lock_init(&sp->lock);
1848
1849	if (mpol) {
1850		struct vm_area_struct pvma;
1851		struct mempolicy *new;
1852
1853		/* contextualize the tmpfs mount point mempolicy */
1854		new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
1855		mpol_put(mpol);	/* drop our ref on sb mpol */
1856		if (IS_ERR(new))
1857			return;		/* no valid nodemask intersection */
1858
1859		/* Create pseudo-vma that contains just the policy */
1860		memset(&pvma, 0, sizeof(struct vm_area_struct));
1861		pvma.vm_end = TASK_SIZE;	/* policy covers entire file */
1862		mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
1863		mpol_put(new);			/* drop initial ref */
1864	}
1865}
1866
1867int mpol_set_shared_policy(struct shared_policy *info,
1868			struct vm_area_struct *vma, struct mempolicy *npol)
1869{
1870	int err;
1871	struct sp_node *new = NULL;
1872	unsigned long sz = vma_pages(vma);
1873
1874	pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
1875		 vma->vm_pgoff,
1876		 sz, npol ? npol->mode : -1,
1877		 npol ? npol->flags : -1,
1878		 npol ? nodes_addr(npol->v.nodes)[0] : -1);
1879
1880	if (npol) {
1881		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1882		if (!new)
1883			return -ENOMEM;
1884	}
1885	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1886	if (err && new)
1887		kmem_cache_free(sn_cache, new);
1888	return err;
1889}
1890
1891/* Free a backing policy store on inode delete. */
1892void mpol_free_shared_policy(struct shared_policy *p)
1893{
1894	struct sp_node *n;
1895	struct rb_node *next;
1896
1897	if (!p->root.rb_node)
1898		return;
1899	spin_lock(&p->lock);
1900	next = rb_first(&p->root);
1901	while (next) {
1902		n = rb_entry(next, struct sp_node, nd);
1903		next = rb_next(&n->nd);
1904		rb_erase(&n->nd, &p->root);
1905		mpol_put(n->policy);
1906		kmem_cache_free(sn_cache, n);
1907	}
1908	spin_unlock(&p->lock);
1909}
1910
1911/* assumes fs == KERNEL_DS */
1912void __init numa_policy_init(void)
1913{
1914	nodemask_t interleave_nodes;
1915	unsigned long largest = 0;
1916	int nid, prefer = 0;
1917
1918	policy_cache = kmem_cache_create("numa_policy",
1919					 sizeof(struct mempolicy),
1920					 0, SLAB_PANIC, NULL);
1921
1922	sn_cache = kmem_cache_create("shared_policy_node",
1923				     sizeof(struct sp_node),
1924				     0, SLAB_PANIC, NULL);
1925
	/*
	 * Set an interleave policy for system init.  Interleaving is
	 * enabled only across suitably sized nodes (>= 16MB of present
	 * memory); if every node is smaller, fall back to the largest one.
	 */
1931	nodes_clear(interleave_nodes);
1932	for_each_node_state(nid, N_HIGH_MEMORY) {
1933		unsigned long total_pages = node_present_pages(nid);
1934
1935		/* Preserve the largest node */
1936		if (largest < total_pages) {
1937			largest = total_pages;
1938			prefer = nid;
1939		}
1940
1941		/* Interleave this node? */
1942		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
1943			node_set(nid, interleave_nodes);
1944	}
1945
1946	/* All too small, use the largest */
1947	if (unlikely(nodes_empty(interleave_nodes)))
1948		node_set(prefer, interleave_nodes);
1949
	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
		printk(KERN_ERR "numa_policy_init: interleaving failed\n");
1952}
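/*
 * Worked example of the size test above: with 4KB pages (PAGE_SHIFT
 * == 12), a node qualifies for interleaving once it has at least 4096
 * present pages, since 4096 << 12 == 16 << 20 == 16MB.
 */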
1953
1954/* Reset policy of current process to default */
1955void numa_default_policy(void)
1956{
1957	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
1958}
1959
1960/*
1961 * Parse and format mempolicy from/to strings
1962 */
1963
1964/*
 * "local" is a pseudo-policy:  MPOL_PREFERRED with the MPOL_F_LOCAL flag.
 * Used only by mpol_parse_str() and mpol_to_str().
1967 */
1968#define MPOL_LOCAL (MPOL_INTERLEAVE + 1)
/* mode-to-name table, indexed by MPOL_* mode value; MPOL_LOCAL is last */
static const char * const policy_types[] =
	{ "default", "prefer", "bind", "interleave", "local" };
1971
1972
1973#ifdef CONFIG_TMPFS
1974/**
1975 * mpol_parse_str - parse string to mempolicy
1976 * @str:  string containing mempolicy to parse
1977 * @mpol:  pointer to struct mempolicy pointer, returned on success.
1978 * @no_context:  flag whether to "contextualize" the mempolicy
1979 *
1980 * Format of input:
1981 *	<mode>[=<flags>][:<nodelist>]
1982 *
1983 * if @no_context is true, save the input nodemask in w.user_nodemask in
1984 * the returned mempolicy.  This will be used to "clone" the mempolicy in
1985 * a specific context [cpuset] at a later time.  Used to parse tmpfs mpol
1986 * mount option.  Note that if 'static' or 'relative' mode flags were
1987 * specified, the input nodemask will already have been saved.  Saving
1988 * it again is redundant, but safe.
1989 *
1990 * On success, returns 0, else 1
1991 */
1992int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
1993{
1994	struct mempolicy *new = NULL;
1995	unsigned short uninitialized_var(mode);
1996	unsigned short uninitialized_var(mode_flags);
1997	nodemask_t nodes;
1998	char *nodelist = strchr(str, ':');
1999	char *flags = strchr(str, '=');
2000	int i;
2001	int err = 1;
2002
2003	if (nodelist) {
2004		/* NUL-terminate mode or flags string */
2005		*nodelist++ = '\0';
2006		if (nodelist_parse(nodelist, nodes))
2007			goto out;
2008		if (!nodes_subset(nodes, node_states[N_HIGH_MEMORY]))
2009			goto out;
2010	} else
2011		nodes_clear(nodes);
2012
2013	if (flags)
2014		*flags++ = '\0';	/* terminate mode string */
2015
2016	for (i = 0; i <= MPOL_LOCAL; i++) {
2017		if (!strcmp(str, policy_types[i])) {
2018			mode = i;
2019			break;
2020		}
2021	}
2022	if (i > MPOL_LOCAL)
2023		goto out;
2024
2025	switch (mode) {
2026	case MPOL_PREFERRED:
2027		/*
2028		 * Insist on a nodelist of one node only
2029		 */
2030		if (nodelist) {
2031			char *rest = nodelist;
2032			while (isdigit(*rest))
2033				rest++;
2034			if (!*rest)
2035				err = 0;
2036		}
2037		break;
2038	case MPOL_INTERLEAVE:
2039		/*
2040		 * Default to online nodes with memory if no nodelist
2041		 */
2042		if (!nodelist)
2043			nodes = node_states[N_HIGH_MEMORY];
2044		err = 0;
2045		break;
	case MPOL_LOCAL:
		/*
		 * Don't allow a nodelist;  mpol_new() checks flags
		 */
		if (nodelist)
			goto out;
		mode = MPOL_PREFERRED;
		err = 0;
		break;
	default:
		/*
		 * MPOL_BIND:    mpol_new() enforces a non-empty nodemask.
		 * MPOL_DEFAULT: mpol_new() enforces an empty nodemask and
		 *		 ignores flags.
		 */
		err = 0;
		break;
	}
2060
2061	mode_flags = 0;
2062	if (flags) {
2063		/*
2064		 * Currently, we only support two mutually exclusive
2065		 * mode flags.
2066		 */
2067		if (!strcmp(flags, "static"))
2068			mode_flags |= MPOL_F_STATIC_NODES;
2069		else if (!strcmp(flags, "relative"))
2070			mode_flags |= MPOL_F_RELATIVE_NODES;
2071		else
2072			err = 1;
2073	}
2074
	if (err)
		goto out;	/* don't allocate a policy on a parse error */

	new = mpol_new(mode, mode_flags, &nodes);
	if (IS_ERR(new))
		err = 1;
	else if (new && no_context)
		new->w.user_nodemask = nodes;	/* save for contextualization */
2080
2081out:
2082	/* Restore string for error message */
2083	if (nodelist)
2084		*--nodelist = ':';
2085	if (flags)
2086		*--flags = '=';
2087	if (!err)
2088		*mpol = new;
2089	return err;
2090}
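/*
 * Illustrative inputs accepted by mpol_parse_str(), as seen in the
 * tmpfs "mpol=" mount option:
 *
 *	"default"			MPOL_DEFAULT
 *	"prefer:2"			MPOL_PREFERRED on node 2
 *	"bind:0-3"			MPOL_BIND to nodes 0-3
 *	"interleave"			MPOL_INTERLEAVE over all memory nodes
 *	"interleave=static:0,2"		MPOL_INTERLEAVE + MPOL_F_STATIC_NODES
 *	"local"				MPOL_PREFERRED with MPOL_F_LOCAL
 */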
2091#endif /* CONFIG_TMPFS */
2092
2093/**
2094 * mpol_to_str - format a mempolicy structure for printing
2095 * @buffer:  to contain formatted mempolicy string
2096 * @maxlen:  length of @buffer
2097 * @pol:  pointer to mempolicy to be formatted
2098 * @no_context:  "context free" mempolicy - use nodemask in w.user_nodemask
2099 *
2100 * Convert a mempolicy into a string.
 * Returns the number of characters written to @buffer,
 * or a negative error code if @buffer is too small.
2103 */
2104int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
2105{
2106	char *p = buffer;
2107	int l;
2108	nodemask_t nodes;
2109	unsigned short mode;
2110	unsigned short flags = pol ? pol->flags : 0;
2111
2112	/*
2113	 * Sanity check:  room for longest mode, flag and some nodes
2114	 */
2115	VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16);
2116
2117	if (!pol || pol == &default_policy)
2118		mode = MPOL_DEFAULT;
2119	else
2120		mode = pol->mode;
2121
2122	switch (mode) {
2123	case MPOL_DEFAULT:
2124		nodes_clear(nodes);
2125		break;
2126
2127	case MPOL_PREFERRED:
2128		nodes_clear(nodes);
2129		if (flags & MPOL_F_LOCAL)
2130			mode = MPOL_LOCAL;	/* pseudo-policy */
2131		else
2132			node_set(pol->v.preferred_node, nodes);
2133		break;
2134
2135	case MPOL_BIND:
2136		/* Fall through */
2137	case MPOL_INTERLEAVE:
2138		if (no_context)
2139			nodes = pol->w.user_nodemask;
2140		else
2141			nodes = pol->v.nodes;
2142		break;
2143
2144	default:
2145		BUG();
2146	}
2147
2148	l = strlen(policy_types[mode]);
2149	if (buffer + maxlen < p + l + 1)
2150		return -ENOSPC;
2151
2152	strcpy(p, policy_types[mode]);
2153	p += l;
2154
2155	if (flags & MPOL_MODE_FLAGS) {
2156		if (buffer + maxlen < p + 2)
2157			return -ENOSPC;
2158		*p++ = '=';
2159
2160		/*
2161		 * Currently, the only defined flags are mutually exclusive
2162		 */
2163		if (flags & MPOL_F_STATIC_NODES)
2164			p += snprintf(p, buffer + maxlen - p, "static");
2165		else if (flags & MPOL_F_RELATIVE_NODES)
2166			p += snprintf(p, buffer + maxlen - p, "relative");
2167	}
2168
2169	if (!nodes_empty(nodes)) {
2170		if (buffer + maxlen < p + 2)
2171			return -ENOSPC;
2172		*p++ = ':';
		p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
2174	}
2175	return p - buffer;
2176}
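/*
 * Illustrative output of mpol_to_str(), mirroring the mpol_parse_str()
 * input format:
 *
 *	MPOL_DEFAULT				-> "default"
 *	MPOL_PREFERRED + MPOL_F_LOCAL		-> "local"
 *	MPOL_BIND to nodes 0-3			-> "bind:0-3"
 *	MPOL_INTERLEAVE + MPOL_F_STATIC_NODES
 *	over nodes 0 and 2			-> "interleave=static:0,2"
 */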
2177
2178struct numa_maps {
2179	unsigned long pages;
2180	unsigned long anon;
2181	unsigned long active;
2182	unsigned long writeback;
2183	unsigned long mapcount_max;
2184	unsigned long dirty;
2185	unsigned long swapcache;
2186	unsigned long node[MAX_NUMNODES];
2187};
2188
2189static void gather_stats(struct page *page, void *private, int pte_dirty)
2190{
2191	struct numa_maps *md = private;
2192	int count = page_mapcount(page);
2193
2194	md->pages++;
2195	if (pte_dirty || PageDirty(page))
2196		md->dirty++;
2197
2198	if (PageSwapCache(page))
2199		md->swapcache++;
2200
2201	if (PageActive(page))
2202		md->active++;
2203
2204	if (PageWriteback(page))
2205		md->writeback++;
2206
2207	if (PageAnon(page))
2208		md->anon++;
2209
2210	if (count > md->mapcount_max)
2211		md->mapcount_max = count;
2212
2213	md->node[page_to_nid(page)]++;
2214}
2215
2216#ifdef CONFIG_HUGETLB_PAGE
2217static void check_huge_range(struct vm_area_struct *vma,
2218		unsigned long start, unsigned long end,
2219		struct numa_maps *md)
2220{
2221	unsigned long addr;
2222	struct page *page;
2223
2224	for (addr = start; addr < end; addr += HPAGE_SIZE) {
2225		pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK);
2226		pte_t pte;
2227
2228		if (!ptep)
2229			continue;
2230
2231		pte = *ptep;
2232		if (pte_none(pte))
2233			continue;
2234
2235		page = pte_page(pte);
2236		if (!page)
2237			continue;
2238
		/* use the pte value snapshotted above, not a second read */
		gather_stats(page, md, pte_dirty(pte));
2240	}
2241}
2242#else
2243static inline void check_huge_range(struct vm_area_struct *vma,
2244		unsigned long start, unsigned long end,
2245		struct numa_maps *md)
2246{
2247}
2248#endif
2249
2250/*
2251 * Display pages allocated per node and memory policy via /proc.
2252 */
2253int show_numa_map(struct seq_file *m, void *v)
2254{
2255	struct proc_maps_private *priv = m->private;
2256	struct vm_area_struct *vma = v;
2257	struct numa_maps *md;
2258	struct file *file = vma->vm_file;
2259	struct mm_struct *mm = vma->vm_mm;
2260	struct mempolicy *pol;
2261	int n;
2262	char buffer[50];
2263
2264	if (!mm)
2265		return 0;
2266
2267	md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
2268	if (!md)
2269		return 0;
2270
2271	pol = get_vma_policy(priv->task, vma, vma->vm_start);
2272	mpol_to_str(buffer, sizeof(buffer), pol, 0);
2273	mpol_cond_put(pol);
2274
2275	seq_printf(m, "%08lx %s", vma->vm_start, buffer);
2276
2277	if (file) {
2278		seq_printf(m, " file=");
2279		seq_path(m, &file->f_path, "\n\t= ");
2280	} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
2281		seq_printf(m, " heap");
2282	} else if (vma->vm_start <= mm->start_stack &&
2283			vma->vm_end >= mm->start_stack) {
2284		seq_printf(m, " stack");
2285	}
2286
2287	if (is_vm_hugetlb_page(vma)) {
2288		check_huge_range(vma, vma->vm_start, vma->vm_end, md);
2289		seq_printf(m, " huge");
2290	} else {
2291		check_pgd_range(vma, vma->vm_start, vma->vm_end,
2292			&node_states[N_HIGH_MEMORY], MPOL_MF_STATS, md);
2293	}
2294
2295	if (!md->pages)
2296		goto out;
2297
	if (md->anon)
		seq_printf(m, " anon=%lu", md->anon);

	if (md->dirty)
		seq_printf(m, " dirty=%lu", md->dirty);

	if (md->pages != md->anon && md->pages != md->dirty)
		seq_printf(m, " mapped=%lu", md->pages);

	if (md->mapcount_max > 1)
		seq_printf(m, " mapmax=%lu", md->mapcount_max);

	if (md->swapcache)
		seq_printf(m, " swapcache=%lu", md->swapcache);

	if (md->active < md->pages && !is_vm_hugetlb_page(vma))
		seq_printf(m, " active=%lu", md->active);

	if (md->writeback)
		seq_printf(m, " writeback=%lu", md->writeback);
2318
2319	for_each_node_state(n, N_HIGH_MEMORY)
2320		if (md->node[n])
2321			seq_printf(m, " N%d=%lu", n, md->node[n]);
2322out:
2323	seq_putc(m, '\n');
2324	kfree(md);
2325
2326	if (m->count < m->size)
2327		m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
2328	return 0;
2329}
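/*
 * Sample /proc/<pid>/numa_maps line as emitted above (the address and
 * counts are made up for illustration):
 *
 *	2aaaaac00000 interleave=static:0-3 file=/mnt/tmp/data dirty=4 N0=1 N1=1 N2=1 N3=1
 *
 * Stat fields are printed only when non-zero, followed by one
 * N<node>=<pages> entry for each node backing the vma.
 */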
2330