mempolicy.c revision b4652e8429100ba5c3ddb49499faa1188c98c246
1/*
2 * Simple NUMA memory policy for the Linux kernel.
3 *
4 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
6 * Subject to the GNU Public License, version 2.
7 *
8 * NUMA policy allows the user to give hints in which node(s) memory should
9 * be allocated.
10 *
11 * Support four policies per VMA and per process:
12 *
13 * The VMA policy has priority over the process policy for a page fault.
14 *
15 * interleave     Allocate memory interleaved over a set of nodes,
16 *                with normal fallback if it fails.
17 *                For VMA based allocations this interleaves based on the
18 *                offset into the backing object or offset into the mapping
19 *                for anonymous memory. For process policy an process counter
20 *                for anonymous memory. For process policy, a process counter
21 *
22 * bind           Only allocate memory on a specific set of nodes,
23 *                no fallback.
24 *                FIXME: memory is allocated starting with the first node
25 *                to the last. It would be better if bind would truly restrict
26 *                the allocation to memory nodes instead
27 *
28 * preferred       Try a specific node first before normal fallback.
29 *                As a special case node -1 here means do the allocation
30 *                on the local CPU. This is normally identical to default,
31 *                but useful to set in a VMA when you have a non default
32 *                process policy.
33 *
34 * default        Allocate on the local node first, or when on a VMA
35 *                use the process policy. This is what Linux always did
36 *		  in a NUMA aware kernel and still does by, ahem, default.
37 *
38 * The process policy is applied for most non interrupt memory allocations
39 * in that process' context. Interrupts ignore the policies and always
40 * try to allocate on the local CPU. The VMA policy is only applied for memory
41 * allocations for a VMA in the VM.
42 *
43 * Currently there are a few corner cases in swapping where the policy
44 * is not applied, but the majority should be handled. When process policy
45 * is used it is not remembered over swap outs/swap ins.
46 *
47 * Only the highest zone in the zone hierarchy gets policied. Allocations
48 * requesting a lower zone just use default policy. This implies that
49 * on systems with highmem, kernel lowmem allocations don't get policied.
50 * Same with GFP_DMA allocations.
51 *
52 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
53 * all users and remembered even when nobody has memory mapped.
54 */
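/*
 * Illustrative userspace sketch (not part of this file): the policies
 * described above are normally requested through the set_mempolicy() and
 * mbind() system calls, e.g. via the libnuma <numaif.h> wrappers.  The node
 * numbers and the "addr"/"length" mapping used in the mbind() call are
 * assumptions made purely for this example:
 *
 *	#include <numaif.h>
 *
 *	unsigned long nodes = (1UL << 0) | (1UL << 1);
 *	unsigned long node0 = 1UL << 0;
 *
 *	(interleave this task's future allocations across nodes 0 and 1)
 *	set_mempolicy(MPOL_INTERLEAVE, &nodes, 8 * sizeof(nodes));
 *
 *	(restrict an existing mapping to node 0, moving misplaced pages)
 *	mbind(addr, length, MPOL_BIND, &node0, 8 * sizeof(node0),
 *	      MPOL_MF_MOVE | MPOL_MF_STRICT);
 */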
55
56/* Notebook:
57   fix mmap readahead to honour policy and enable policy for any page cache
58   object
59   statistics for bigpages
60   global policy for page cache? currently it uses process policy. Requires
61   first item above.
62   handle mremap for shared memory (currently ignored for the policy)
63   grows down?
64   make bind policy root only? It can trigger oom much faster and the
65   kernel is not always graceful about that.
66*/
67
68#include <linux/mempolicy.h>
69#include <linux/mm.h>
70#include <linux/highmem.h>
71#include <linux/hugetlb.h>
72#include <linux/kernel.h>
73#include <linux/sched.h>
74#include <linux/nodemask.h>
75#include <linux/cpuset.h>
76#include <linux/slab.h>
77#include <linux/string.h>
78#include <linux/module.h>
79#include <linux/nsproxy.h>
80#include <linux/interrupt.h>
81#include <linux/init.h>
82#include <linux/compat.h>
83#include <linux/swap.h>
84#include <linux/seq_file.h>
85#include <linux/proc_fs.h>
86#include <linux/migrate.h>
87#include <linux/ksm.h>
88#include <linux/rmap.h>
89#include <linux/security.h>
90#include <linux/syscalls.h>
91#include <linux/ctype.h>
92#include <linux/mm_inline.h>
93
94#include <asm/tlbflush.h>
95#include <asm/uaccess.h>
96
97#include "internal.h"
98
99/* Internal flags */
100#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
101#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
102#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)		/* Gather statistics */
103
104static struct kmem_cache *policy_cache;
105static struct kmem_cache *sn_cache;
106
107/* Highest zone. A specific allocation for a zone below that is not
108   policied. */
109enum zone_type policy_zone = 0;
110
111/*
112 * run-time system-wide default policy => local allocation
113 */
114struct mempolicy default_policy = {
115	.refcnt = ATOMIC_INIT(1), /* never free it */
116	.mode = MPOL_PREFERRED,
117	.flags = MPOL_F_LOCAL,
118};
119
120static const struct mempolicy_operations {
121	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
122	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
123} mpol_ops[MPOL_MAX];
124
125/* Check that the nodemask contains at least one populated zone */
126static int is_valid_nodemask(const nodemask_t *nodemask)
127{
128	int nd, k;
129
130	for_each_node_mask(nd, *nodemask) {
131		struct zone *z;
132
133		for (k = 0; k <= policy_zone; k++) {
134			z = &NODE_DATA(nd)->node_zones[k];
135			if (z->present_pages > 0)
136				return 1;
137		}
138	}
139
140	return 0;
141}
142
143static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
144{
145	return pol->flags & MPOL_MODE_FLAGS;
146}
147
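/*
 * Illustrative note on the remapping below (node numbers are assumptions for
 * the example): a relative nodemask of {0,2} under a cpuset that allows nodes
 * {4,5,6} (weight 3) folds to {0,2} and maps onto {4,6}; a relative bit
 * beyond the weight wraps, so {0,4} folds to {0,1} and maps onto {4,5}.
 */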
148static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
149				   const nodemask_t *rel)
150{
151	nodemask_t tmp;
152	nodes_fold(tmp, *orig, nodes_weight(*rel));
153	nodes_onto(*ret, tmp, *rel);
154}
155
156static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
157{
158	if (nodes_empty(*nodes))
159		return -EINVAL;
160	pol->v.nodes = *nodes;
161	return 0;
162}
163
164static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
165{
166	if (!nodes)
167		pol->flags |= MPOL_F_LOCAL;	/* local allocation */
168	else if (nodes_empty(*nodes))
169		return -EINVAL;			/*  no allowed nodes */
170	else
171		pol->v.preferred_node = first_node(*nodes);
172	return 0;
173}
174
175static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
176{
177	if (!is_valid_nodemask(nodes))
178		return -EINVAL;
179	pol->v.nodes = *nodes;
180	return 0;
181}
182
183/*
184 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
185 * any, for the new policy.  mpol_new() has already validated the nodes
186 * parameter with respect to the policy mode and flags.  But, we need to
187 * handle an empty nodemask with MPOL_PREFERRED here.
188 *
189 * Must be called holding task's alloc_lock to protect task's mems_allowed
190 * and mempolicy.  May also be called holding the mmap_semaphore for write.
191 */
192static int mpol_set_nodemask(struct mempolicy *pol,
193		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
194{
195	int ret;
196
197	/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
198	if (pol == NULL)
199		return 0;
200	/* Check N_HIGH_MEMORY */
201	nodes_and(nsc->mask1,
202		  cpuset_current_mems_allowed, node_states[N_HIGH_MEMORY]);
203
204	VM_BUG_ON(!nodes);
205	if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
206		nodes = NULL;	/* explicit local allocation */
207	else {
208		if (pol->flags & MPOL_F_RELATIVE_NODES)
209			mpol_relative_nodemask(&nsc->mask2, nodes,&nsc->mask1);
210		else
211			nodes_and(nsc->mask2, *nodes, nsc->mask1);
212
213		if (mpol_store_user_nodemask(pol))
214			pol->w.user_nodemask = *nodes;
215		else
216			pol->w.cpuset_mems_allowed =
217						cpuset_current_mems_allowed;
218	}
219
220	if (nodes)
221		ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
222	else
223		ret = mpol_ops[pol->mode].create(pol, NULL);
224	return ret;
225}
226
227/*
228 * This function just creates a new policy, does some checks and simple
229 * initialization. You must invoke mpol_set_nodemask() to set nodes.
230 */
231static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
232				  nodemask_t *nodes)
233{
234	struct mempolicy *policy;
235
236	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
237		 mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
238
239	if (mode == MPOL_DEFAULT) {
240		if (nodes && !nodes_empty(*nodes))
241			return ERR_PTR(-EINVAL);
242		return NULL;	/* simply delete any existing policy */
243	}
244	VM_BUG_ON(!nodes);
245
246	/*
247	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
248	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
249	 * All other modes require a valid pointer to a non-empty nodemask.
250	 */
251	if (mode == MPOL_PREFERRED) {
252		if (nodes_empty(*nodes)) {
253			if (((flags & MPOL_F_STATIC_NODES) ||
254			     (flags & MPOL_F_RELATIVE_NODES)))
255				return ERR_PTR(-EINVAL);
256		}
257	} else if (nodes_empty(*nodes))
258		return ERR_PTR(-EINVAL);
259	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
260	if (!policy)
261		return ERR_PTR(-ENOMEM);
262	atomic_set(&policy->refcnt, 1);
263	policy->mode = mode;
264	policy->flags = flags;
265
266	return policy;
267}
268
269/* Slow path of a mpol destructor. */
270void __mpol_put(struct mempolicy *p)
271{
272	if (!atomic_dec_and_test(&p->refcnt))
273		return;
274	kmem_cache_free(policy_cache, p);
275}
276
277static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
278{
279}
280
281static void mpol_rebind_nodemask(struct mempolicy *pol,
282				 const nodemask_t *nodes)
283{
284	nodemask_t tmp;
285
286	if (pol->flags & MPOL_F_STATIC_NODES)
287		nodes_and(tmp, pol->w.user_nodemask, *nodes);
288	else if (pol->flags & MPOL_F_RELATIVE_NODES)
289		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
290	else {
291		nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed,
292			    *nodes);
293		pol->w.cpuset_mems_allowed = *nodes;
294	}
295
296	pol->v.nodes = tmp;
297	if (!node_isset(current->il_next, tmp)) {
298		current->il_next = next_node(current->il_next, tmp);
299		if (current->il_next >= MAX_NUMNODES)
300			current->il_next = first_node(tmp);
301		if (current->il_next >= MAX_NUMNODES)
302			current->il_next = numa_node_id();
303	}
304}
305
306static void mpol_rebind_preferred(struct mempolicy *pol,
307				  const nodemask_t *nodes)
308{
309	nodemask_t tmp;
310
311	if (pol->flags & MPOL_F_STATIC_NODES) {
312		int node = first_node(pol->w.user_nodemask);
313
314		if (node_isset(node, *nodes)) {
315			pol->v.preferred_node = node;
316			pol->flags &= ~MPOL_F_LOCAL;
317		} else
318			pol->flags |= MPOL_F_LOCAL;
319	} else if (pol->flags & MPOL_F_RELATIVE_NODES) {
320		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
321		pol->v.preferred_node = first_node(tmp);
322	} else if (!(pol->flags & MPOL_F_LOCAL)) {
323		pol->v.preferred_node = node_remap(pol->v.preferred_node,
324						   pol->w.cpuset_mems_allowed,
325						   *nodes);
326		pol->w.cpuset_mems_allowed = *nodes;
327	}
328}
329
330/* Migrate a policy to a different set of nodes */
331static void mpol_rebind_policy(struct mempolicy *pol,
332			       const nodemask_t *newmask)
333{
334	if (!pol)
335		return;
336	if (!mpol_store_user_nodemask(pol) &&
337	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
338		return;
339	mpol_ops[pol->mode].rebind(pol, newmask);
340}
341
342/*
343 * Wrapper for mpol_rebind_policy() that just requires task
344 * pointer, and updates task mempolicy.
345 *
346 * Called with task's alloc_lock held.
347 */
348
349void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
350{
351	mpol_rebind_policy(tsk->mempolicy, new);
352}
353
354/*
355 * Rebind each vma in mm to new nodemask.
356 *
357 * Call holding a reference to mm.  Takes mm->mmap_sem during call.
358 */
359
360void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
361{
362	struct vm_area_struct *vma;
363
364	down_write(&mm->mmap_sem);
365	for (vma = mm->mmap; vma; vma = vma->vm_next)
366		mpol_rebind_policy(vma->vm_policy, new);
367	up_write(&mm->mmap_sem);
368}
369
370static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
371	[MPOL_DEFAULT] = {
372		.rebind = mpol_rebind_default,
373	},
374	[MPOL_INTERLEAVE] = {
375		.create = mpol_new_interleave,
376		.rebind = mpol_rebind_nodemask,
377	},
378	[MPOL_PREFERRED] = {
379		.create = mpol_new_preferred,
380		.rebind = mpol_rebind_preferred,
381	},
382	[MPOL_BIND] = {
383		.create = mpol_new_bind,
384		.rebind = mpol_rebind_nodemask,
385	},
386};
387
388static void gather_stats(struct page *, void *, int pte_dirty);
389static void migrate_page_add(struct page *page, struct list_head *pagelist,
390				unsigned long flags);
391
392/* Scan through pages, checking whether they satisfy the given conditions. */
393static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
394		unsigned long addr, unsigned long end,
395		const nodemask_t *nodes, unsigned long flags,
396		void *private)
397{
398	pte_t *orig_pte;
399	pte_t *pte;
400	spinlock_t *ptl;
401
402	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
403	do {
404		struct page *page;
405		int nid;
406
407		if (!pte_present(*pte))
408			continue;
409		page = vm_normal_page(vma, addr, *pte);
410		if (!page)
411			continue;
412		/*
413		 * vm_normal_page() filters out zero pages, but there might
414		 * still be PageReserved pages to skip, perhaps in a VDSO.
415		 * And we cannot move PageKsm pages sensibly or safely yet.
416		 */
417		if (PageReserved(page) || PageKsm(page))
418			continue;
419		nid = page_to_nid(page);
420		if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
421			continue;
422
423		if (flags & MPOL_MF_STATS)
424			gather_stats(page, private, pte_dirty(*pte));
425		else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
426			migrate_page_add(page, private, flags);
427		else
428			break;
429	} while (pte++, addr += PAGE_SIZE, addr != end);
430	pte_unmap_unlock(orig_pte, ptl);
431	return addr != end;
432}
433
434static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
435		unsigned long addr, unsigned long end,
436		const nodemask_t *nodes, unsigned long flags,
437		void *private)
438{
439	pmd_t *pmd;
440	unsigned long next;
441
442	pmd = pmd_offset(pud, addr);
443	do {
444		next = pmd_addr_end(addr, end);
445		if (pmd_none_or_clear_bad(pmd))
446			continue;
447		if (check_pte_range(vma, pmd, addr, next, nodes,
448				    flags, private))
449			return -EIO;
450	} while (pmd++, addr = next, addr != end);
451	return 0;
452}
453
454static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
455		unsigned long addr, unsigned long end,
456		const nodemask_t *nodes, unsigned long flags,
457		void *private)
458{
459	pud_t *pud;
460	unsigned long next;
461
462	pud = pud_offset(pgd, addr);
463	do {
464		next = pud_addr_end(addr, end);
465		if (pud_none_or_clear_bad(pud))
466			continue;
467		if (check_pmd_range(vma, pud, addr, next, nodes,
468				    flags, private))
469			return -EIO;
470	} while (pud++, addr = next, addr != end);
471	return 0;
472}
473
474static inline int check_pgd_range(struct vm_area_struct *vma,
475		unsigned long addr, unsigned long end,
476		const nodemask_t *nodes, unsigned long flags,
477		void *private)
478{
479	pgd_t *pgd;
480	unsigned long next;
481
482	pgd = pgd_offset(vma->vm_mm, addr);
483	do {
484		next = pgd_addr_end(addr, end);
485		if (pgd_none_or_clear_bad(pgd))
486			continue;
487		if (check_pud_range(vma, pgd, addr, next, nodes,
488				    flags, private))
489			return -EIO;
490	} while (pgd++, addr = next, addr != end);
491	return 0;
492}
493
494/*
495 * Check if all pages in a range are on a set of nodes.
496 * If pagelist != NULL then isolate pages from the LRU and
497 * put them on the pagelist.
498 */
499static struct vm_area_struct *
500check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
501		const nodemask_t *nodes, unsigned long flags, void *private)
502{
503	int err;
504	struct vm_area_struct *first, *vma, *prev;
505
506
507	first = find_vma(mm, start);
508	if (!first)
509		return ERR_PTR(-EFAULT);
510	prev = NULL;
511	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
512		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
513			if (!vma->vm_next && vma->vm_end < end)
514				return ERR_PTR(-EFAULT);
515			if (prev && prev->vm_end < vma->vm_start)
516				return ERR_PTR(-EFAULT);
517		}
518		if (!is_vm_hugetlb_page(vma) &&
519		    ((flags & MPOL_MF_STRICT) ||
520		     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
521				vma_migratable(vma)))) {
522			unsigned long endvma = vma->vm_end;
523
524			if (endvma > end)
525				endvma = end;
526			if (vma->vm_start > start)
527				start = vma->vm_start;
528			err = check_pgd_range(vma, start, endvma, nodes,
529						flags, private);
530			if (err) {
531				first = ERR_PTR(err);
532				break;
533			}
534		}
535		prev = vma;
536	}
537	return first;
538}
539
540/* Apply policy to a single VMA */
541static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
542{
543	int err = 0;
544	struct mempolicy *old = vma->vm_policy;
545
546	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
547		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
548		 vma->vm_ops, vma->vm_file,
549		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
550
551	if (vma->vm_ops && vma->vm_ops->set_policy)
552		err = vma->vm_ops->set_policy(vma, new);
553	if (!err) {
554		mpol_get(new);
555		vma->vm_policy = new;
556		mpol_put(old);
557	}
558	return err;
559}
560
561/* Step 2: apply policy to a range and do splits. */
562static int mbind_range(struct mm_struct *mm, unsigned long start,
563		       unsigned long end, struct mempolicy *new_pol)
564{
565	struct vm_area_struct *next;
566	struct vm_area_struct *prev;
567	struct vm_area_struct *vma;
568	int err = 0;
569	pgoff_t pgoff;
570	unsigned long vmstart;
571	unsigned long vmend;
572
573	vma = find_vma_prev(mm, start, &prev);
574	if (!vma || vma->vm_start > start)
575		return -EFAULT;
576
577	for (; vma && vma->vm_start < end; prev = vma, vma = next) {
578		next = vma->vm_next;
579		vmstart = max(start, vma->vm_start);
580		vmend   = min(end, vma->vm_end);
581
582		pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
583		prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
584				  vma->anon_vma, vma->vm_file, pgoff, new_pol);
585		if (prev) {
586			vma = prev;
587			next = vma->vm_next;
588			continue;
589		}
590		if (vma->vm_start != vmstart) {
591			err = split_vma(vma->vm_mm, vma, vmstart, 1);
592			if (err)
593				goto out;
594		}
595		if (vma->vm_end != vmend) {
596			err = split_vma(vma->vm_mm, vma, vmend, 0);
597			if (err)
598				goto out;
599		}
600		err = policy_vma(vma, new_pol);
601		if (err)
602			goto out;
603	}
604
605 out:
606	return err;
607}
608
609/*
610 * Update task->flags PF_MEMPOLICY bit: set iff non-default
611 * mempolicy.  Allows more rapid checking of this (combined perhaps
612 * with other PF_* flag bits) on memory allocation hot code paths.
613 *
614 * If called from outside this file, the task 'p' should -only- be
615 * a newly forked child not yet visible on the task list, because
616 * manipulating the task flags of a visible task is not safe.
617 *
618 * The above limitation is why this routine has the funny name
619 * mpol_fix_fork_child_flag().
620 *
621 * It is also safe to call this with a task pointer of current,
622 * which the static wrapper mpol_set_task_struct_flag() does,
623 * for use within this file.
624 */
625
626void mpol_fix_fork_child_flag(struct task_struct *p)
627{
628	if (p->mempolicy)
629		p->flags |= PF_MEMPOLICY;
630	else
631		p->flags &= ~PF_MEMPOLICY;
632}
633
634static void mpol_set_task_struct_flag(void)
635{
636	mpol_fix_fork_child_flag(current);
637}
638
639/* Set the process memory policy */
640static long do_set_mempolicy(unsigned short mode, unsigned short flags,
641			     nodemask_t *nodes)
642{
643	struct mempolicy *new, *old;
644	struct mm_struct *mm = current->mm;
645	NODEMASK_SCRATCH(scratch);
646	int ret;
647
648	if (!scratch)
649		return -ENOMEM;
650
651	new = mpol_new(mode, flags, nodes);
652	if (IS_ERR(new)) {
653		ret = PTR_ERR(new);
654		goto out;
655	}
656	/*
657	 * prevent changing our mempolicy while show_numa_maps()
658	 * is using it.
659	 * Note:  do_set_mempolicy() can be called at init time
660	 * with no 'mm'.
661	 */
662	if (mm)
663		down_write(&mm->mmap_sem);
664	task_lock(current);
665	ret = mpol_set_nodemask(new, nodes, scratch);
666	if (ret) {
667		task_unlock(current);
668		if (mm)
669			up_write(&mm->mmap_sem);
670		mpol_put(new);
671		goto out;
672	}
673	old = current->mempolicy;
674	current->mempolicy = new;
675	mpol_set_task_struct_flag();
676	if (new && new->mode == MPOL_INTERLEAVE &&
677	    nodes_weight(new->v.nodes))
678		current->il_next = first_node(new->v.nodes);
679	task_unlock(current);
680	if (mm)
681		up_write(&mm->mmap_sem);
682
683	mpol_put(old);
684	ret = 0;
685out:
686	NODEMASK_SCRATCH_FREE(scratch);
687	return ret;
688}
689
690/*
691 * Return nodemask for policy for get_mempolicy() query
692 *
693 * Called with task's alloc_lock held
694 */
695static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
696{
697	nodes_clear(*nodes);
698	if (p == &default_policy)
699		return;
700
701	switch (p->mode) {
702	case MPOL_BIND:
703		/* Fall through */
704	case MPOL_INTERLEAVE:
705		*nodes = p->v.nodes;
706		break;
707	case MPOL_PREFERRED:
708		if (!(p->flags & MPOL_F_LOCAL))
709			node_set(p->v.preferred_node, *nodes);
710		/* else return empty node mask for local allocation */
711		break;
712	default:
713		BUG();
714	}
715}
716
717static int lookup_node(struct mm_struct *mm, unsigned long addr)
718{
719	struct page *p;
720	int err;
721
722	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
723	if (err >= 0) {
724		err = page_to_nid(p);
725		put_page(p);
726	}
727	return err;
728}
729
730/* Retrieve NUMA policy */
731static long do_get_mempolicy(int *policy, nodemask_t *nmask,
732			     unsigned long addr, unsigned long flags)
733{
734	int err;
735	struct mm_struct *mm = current->mm;
736	struct vm_area_struct *vma = NULL;
737	struct mempolicy *pol = current->mempolicy;
738
739	if (flags &
740		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
741		return -EINVAL;
742
743	if (flags & MPOL_F_MEMS_ALLOWED) {
744		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
745			return -EINVAL;
746		*policy = 0;	/* just so it's initialized */
747		task_lock(current);
748		*nmask  = cpuset_current_mems_allowed;
749		task_unlock(current);
750		return 0;
751	}
752
753	if (flags & MPOL_F_ADDR) {
754		/*
755		 * Do NOT fall back to task policy if the
756		 * vma/shared policy at addr is NULL.  We
757		 * want to return MPOL_DEFAULT in this case.
758		 */
759		down_read(&mm->mmap_sem);
760		vma = find_vma_intersection(mm, addr, addr+1);
761		if (!vma) {
762			up_read(&mm->mmap_sem);
763			return -EFAULT;
764		}
765		if (vma->vm_ops && vma->vm_ops->get_policy)
766			pol = vma->vm_ops->get_policy(vma, addr);
767		else
768			pol = vma->vm_policy;
769	} else if (addr)
770		return -EINVAL;
771
772	if (!pol)
773		pol = &default_policy;	/* indicates default behavior */
774
775	if (flags & MPOL_F_NODE) {
776		if (flags & MPOL_F_ADDR) {
777			err = lookup_node(mm, addr);
778			if (err < 0)
779				goto out;
780			*policy = err;
781		} else if (pol == current->mempolicy &&
782				pol->mode == MPOL_INTERLEAVE) {
783			*policy = current->il_next;
784		} else {
785			err = -EINVAL;
786			goto out;
787		}
788	} else {
789		*policy = pol == &default_policy ? MPOL_DEFAULT :
790						pol->mode;
791		/*
792		 * Internal mempolicy flags must be masked off before exposing
793		 * the policy to userspace.
794		 */
795		*policy |= (pol->flags & MPOL_MODE_FLAGS);
796	}
797
798	if (vma) {
799		up_read(&current->mm->mmap_sem);
800		vma = NULL;
801	}
802
803	err = 0;
804	if (nmask) {
805		if (mpol_store_user_nodemask(pol)) {
806			*nmask = pol->w.user_nodemask;
807		} else {
808			task_lock(current);
809			get_policy_nodemask(pol, nmask);
810			task_unlock(current);
811		}
812	}
813
814 out:
815	mpol_cond_put(pol);
816	if (vma)
817		up_read(&current->mm->mmap_sem);
818	return err;
819}
820
821#ifdef CONFIG_MIGRATION
822/*
823 * page migration
824 */
825static void migrate_page_add(struct page *page, struct list_head *pagelist,
826				unsigned long flags)
827{
828	/*
829	 * Avoid migrating a page that is shared with others.
830	 */
831	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
832		if (!isolate_lru_page(page)) {
833			list_add_tail(&page->lru, pagelist);
834			inc_zone_page_state(page, NR_ISOLATED_ANON +
835					    page_is_file_cache(page));
836		}
837	}
838}
839
840static struct page *new_node_page(struct page *page, unsigned long node, int **x)
841{
842	return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
843}
844
845/*
846 * Migrate pages from one node to a target node.
847 * Returns error or the number of pages not migrated.
848 */
849static int migrate_to_node(struct mm_struct *mm, int source, int dest,
850			   int flags)
851{
852	nodemask_t nmask;
853	LIST_HEAD(pagelist);
854	int err = 0;
855
856	nodes_clear(nmask);
857	node_set(source, nmask);
858
859	check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
860			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
861
862	if (!list_empty(&pagelist))
863		err = migrate_pages(&pagelist, new_node_page, dest, 0);
864
865	return err;
866}
867
868/*
869 * Move pages between the two nodesets so as to preserve the physical
870 * layout as much as possible.
871 *
872 * Returns the number of pages that could not be moved.
873 */
874int do_migrate_pages(struct mm_struct *mm,
875	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
876{
877	int busy = 0;
878	int err;
879	nodemask_t tmp;
880
881	err = migrate_prep();
882	if (err)
883		return err;
884
885	down_read(&mm->mmap_sem);
886
887	err = migrate_vmas(mm, from_nodes, to_nodes, flags);
888	if (err)
889		goto out;
890
891	/*
892	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
893	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
894	 * bit in 'tmp', and return that <source, dest> pair for migration.
895	 * The pair of nodemasks 'to' and 'from' define the map.
896	 *
897	 * If no pair of bits is found that way, fallback to picking some
898	 * pair of 'source' and 'dest' bits that are not the same.  If the
899	 * 'source' and 'dest' bits are the same, this represents a node
900	 * that will be migrating to itself, so no pages need move.
901	 *
902	 * If no bits are left in 'tmp', or if all remaining bits left
903	 * in 'tmp' correspond to the same bit in 'to', return false
904	 * (nothing left to migrate).
905	 *
906	 * This lets us pick a pair of nodes to migrate between, such that
907	 * if possible the dest node is not already occupied by some other
908	 * source node, minimizing the risk of overloading the memory on a
909	 * node that would happen if we migrated incoming memory to a node
910	 * before migrating outgoing memory source that same node.
911	 *
912	 * A single scan of tmp is sufficient.  As we go, we remember the
913	 * most recent <s, d> pair that moved (s != d).  If we find a pair
914	 * that not only moved, but what's better, moved to an empty slot
915 * (d is not set in tmp), then we break out immediately with that pair.
916 * Otherwise, when we finish scanning tmp, we at least have the
917	 * most recent <s, d> pair that moved.  If we get all the way through
918	 * the scan of tmp without finding any node that moved, much less
919	 * moved to an empty node, then there is nothing left worth migrating.
920	 */
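	/*
	 * Worked example (illustrative): from = {0,1}, to = {1,2}.  The first
	 * scan finds <0,1>, but node 1 is still a pending source, then <1,2>,
	 * whose destination 2 is "empty", so 1 -> 2 is migrated first.  With
	 * node 1 cleared from tmp, the next scan picks <0,1>, so node 0's
	 * pages move to node 1 only after node 1 has been drained to node 2.
	 */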
921
922	tmp = *from_nodes;
923	while (!nodes_empty(tmp)) {
924		int s,d;
925		int source = -1;
926		int dest = 0;
927
928		for_each_node_mask(s, tmp) {
929			d = node_remap(s, *from_nodes, *to_nodes);
930			if (s == d)
931				continue;
932
933			source = s;	/* Node moved. Memorize */
934			dest = d;
935
936			/* dest not in remaining from nodes? */
937			if (!node_isset(dest, tmp))
938				break;
939		}
940		if (source == -1)
941			break;
942
943		node_clear(source, tmp);
944		err = migrate_to_node(mm, source, dest, flags);
945		if (err > 0)
946			busy += err;
947		if (err < 0)
948			break;
949	}
950out:
951	up_read(&mm->mmap_sem);
952	if (err < 0)
953		return err;
954	return busy;
955
956}
957
958/*
959 * Allocate a new page for page migration based on vma policy.
960 * Start assuming that page is mapped by vma pointed to by @private.
961 * Search forward from there, if not.  N.B., this assumes that the
962 * list of pages handed to migrate_pages()--which is how we get here--
963 * is in virtual address order.
964 */
965static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
966{
967	struct vm_area_struct *vma = (struct vm_area_struct *)private;
968	unsigned long uninitialized_var(address);
969
970	while (vma) {
971		address = page_address_in_vma(page, vma);
972		if (address != -EFAULT)
973			break;
974		vma = vma->vm_next;
975	}
976
977	/*
978	 * if !vma, alloc_page_vma() will use task or system default policy
979	 */
980	return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
981}
982#else
983
984static void migrate_page_add(struct page *page, struct list_head *pagelist,
985				unsigned long flags)
986{
987}
988
989int do_migrate_pages(struct mm_struct *mm,
990	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
991{
992	return -ENOSYS;
993}
994
995static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
996{
997	return NULL;
998}
999#endif
1000
1001static long do_mbind(unsigned long start, unsigned long len,
1002		     unsigned short mode, unsigned short mode_flags,
1003		     nodemask_t *nmask, unsigned long flags)
1004{
1005	struct vm_area_struct *vma;
1006	struct mm_struct *mm = current->mm;
1007	struct mempolicy *new;
1008	unsigned long end;
1009	int err;
1010	LIST_HEAD(pagelist);
1011
1012	if (flags & ~(unsigned long)(MPOL_MF_STRICT |
1013				     MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
1014		return -EINVAL;
1015	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1016		return -EPERM;
1017
1018	if (start & ~PAGE_MASK)
1019		return -EINVAL;
1020
1021	if (mode == MPOL_DEFAULT)
1022		flags &= ~MPOL_MF_STRICT;
1023
1024	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1025	end = start + len;
1026
1027	if (end < start)
1028		return -EINVAL;
1029	if (end == start)
1030		return 0;
1031
1032	new = mpol_new(mode, mode_flags, nmask);
1033	if (IS_ERR(new))
1034		return PTR_ERR(new);
1035
1036	/*
1037	 * If we are using the default policy then operation
1038	 * on discontinuous address spaces is okay after all
1039	 */
1040	if (!new)
1041		flags |= MPOL_MF_DISCONTIG_OK;
1042
1043	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1044		 start, start + len, mode, mode_flags,
1045		 nmask ? nodes_addr(*nmask)[0] : -1);
1046
1047	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1048
1049		err = migrate_prep();
1050		if (err)
1051			goto mpol_out;
1052	}
1053	{
1054		NODEMASK_SCRATCH(scratch);
1055		if (scratch) {
1056			down_write(&mm->mmap_sem);
1057			task_lock(current);
1058			err = mpol_set_nodemask(new, nmask, scratch);
1059			task_unlock(current);
1060			if (err)
1061				up_write(&mm->mmap_sem);
1062		} else
1063			err = -ENOMEM;
1064		NODEMASK_SCRATCH_FREE(scratch);
1065	}
1066	if (err)
1067		goto mpol_out;
1068
1069	vma = check_range(mm, start, end, nmask,
1070			  flags | MPOL_MF_INVERT, &pagelist);
1071
1072	err = PTR_ERR(vma);
1073	if (!IS_ERR(vma)) {
1074		int nr_failed = 0;
1075
1076		err = mbind_range(mm, start, end, new);
1077
1078		if (!list_empty(&pagelist))
1079			nr_failed = migrate_pages(&pagelist, new_vma_page,
1080						(unsigned long)vma, 0);
1081
1082		if (!err && nr_failed && (flags & MPOL_MF_STRICT))
1083			err = -EIO;
1084	} else
1085		putback_lru_pages(&pagelist);
1086
1087	up_write(&mm->mmap_sem);
1088 mpol_out:
1089	mpol_put(new);
1090	return err;
1091}
1092
1093/*
1094 * User space interface with variable sized bitmaps for nodelists.
1095 */
1096
1097/* Copy a node mask from user space. */
1098static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1099		     unsigned long maxnode)
1100{
1101	unsigned long k;
1102	unsigned long nlongs;
1103	unsigned long endmask;
1104
1105	--maxnode;
1106	nodes_clear(*nodes);
1107	if (maxnode == 0 || !nmask)
1108		return 0;
1109	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1110		return -EINVAL;
1111
1112	nlongs = BITS_TO_LONGS(maxnode);
1113	if ((maxnode % BITS_PER_LONG) == 0)
1114		endmask = ~0UL;
1115	else
1116		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1117
1118	/* If the user specified more nodes than supported, just check
1119	   that the unsupported part is all zero. */
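	/*
	 * Illustrative example (assuming a 64-bit kernel with
	 * MAX_NUMNODES == 64): a caller passing maxnode == 129 supplies two
	 * longs; the second long must then be entirely zero, and only the
	 * first long is actually copied into *nodes below.
	 */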
1120	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1121		if (nlongs > PAGE_SIZE/sizeof(long))
1122			return -EINVAL;
1123		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1124			unsigned long t;
1125			if (get_user(t, nmask + k))
1126				return -EFAULT;
1127			if (k == nlongs - 1) {
1128				if (t & endmask)
1129					return -EINVAL;
1130			} else if (t)
1131				return -EINVAL;
1132		}
1133		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1134		endmask = ~0UL;
1135	}
1136
1137	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1138		return -EFAULT;
1139	nodes_addr(*nodes)[nlongs-1] &= endmask;
1140	return 0;
1141}
1142
1143/* Copy a kernel node mask to user space */
1144static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1145			      nodemask_t *nodes)
1146{
1147	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1148	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1149
1150	if (copy > nbytes) {
1151		if (copy > PAGE_SIZE)
1152			return -EINVAL;
1153		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1154			return -EFAULT;
1155		copy = nbytes;
1156	}
1157	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1158}
1159
1160SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1161		unsigned long, mode, unsigned long __user *, nmask,
1162		unsigned long, maxnode, unsigned, flags)
1163{
1164	nodemask_t nodes;
1165	int err;
1166	unsigned short mode_flags;
1167
1168	mode_flags = mode & MPOL_MODE_FLAGS;
1169	mode &= ~MPOL_MODE_FLAGS;
1170	if (mode >= MPOL_MAX)
1171		return -EINVAL;
1172	if ((mode_flags & MPOL_F_STATIC_NODES) &&
1173	    (mode_flags & MPOL_F_RELATIVE_NODES))
1174		return -EINVAL;
1175	err = get_nodes(&nodes, nmask, maxnode);
1176	if (err)
1177		return err;
1178	return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1179}
1180
1181/* Set the process memory policy */
1182SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask,
1183		unsigned long, maxnode)
1184{
1185	int err;
1186	nodemask_t nodes;
1187	unsigned short flags;
1188
1189	flags = mode & MPOL_MODE_FLAGS;
1190	mode &= ~MPOL_MODE_FLAGS;
1191	if ((unsigned int)mode >= MPOL_MAX)
1192		return -EINVAL;
1193	if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1194		return -EINVAL;
1195	err = get_nodes(&nodes, nmask, maxnode);
1196	if (err)
1197		return err;
1198	return do_set_mempolicy(mode, flags, &nodes);
1199}
1200
1201SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1202		const unsigned long __user *, old_nodes,
1203		const unsigned long __user *, new_nodes)
1204{
1205	const struct cred *cred = current_cred(), *tcred;
1206	struct mm_struct *mm;
1207	struct task_struct *task;
1208	nodemask_t old;
1209	nodemask_t new;
1210	nodemask_t task_nodes;
1211	int err;
1212
1213	err = get_nodes(&old, old_nodes, maxnode);
1214	if (err)
1215		return err;
1216
1217	err = get_nodes(&new, new_nodes, maxnode);
1218	if (err)
1219		return err;
1220
1221	/* Find the mm_struct */
1222	read_lock(&tasklist_lock);
1223	task = pid ? find_task_by_vpid(pid) : current;
1224	if (!task) {
1225		read_unlock(&tasklist_lock);
1226		return -ESRCH;
1227	}
1228	mm = get_task_mm(task);
1229	read_unlock(&tasklist_lock);
1230
1231	if (!mm)
1232		return -EINVAL;
1233
1234	/*
1235	 * Check if this process has the right to modify the specified
1236	 * process. The right exists if the process has administrative
1237	 * capabilities, superuser privileges or the same
1238	 * userid as the target process.
1239	 */
1240	rcu_read_lock();
1241	tcred = __task_cred(task);
1242	if (cred->euid != tcred->suid && cred->euid != tcred->uid &&
1243	    cred->uid  != tcred->suid && cred->uid  != tcred->uid &&
1244	    !capable(CAP_SYS_NICE)) {
1245		rcu_read_unlock();
1246		err = -EPERM;
1247		goto out;
1248	}
1249	rcu_read_unlock();
1250
1251	task_nodes = cpuset_mems_allowed(task);
1252	/* Is the user allowed to access the target nodes? */
1253	if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
1254		err = -EPERM;
1255		goto out;
1256	}
1257
1258	if (!nodes_subset(new, node_states[N_HIGH_MEMORY])) {
1259		err = -EINVAL;
1260		goto out;
1261	}
1262
1263	err = security_task_movememory(task);
1264	if (err)
1265		goto out;
1266
1267	err = do_migrate_pages(mm, &old, &new,
1268		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1269out:
1270	mmput(mm);
1271	return err;
1272}
1273
1274
1275/* Retrieve NUMA policy */
1276SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1277		unsigned long __user *, nmask, unsigned long, maxnode,
1278		unsigned long, addr, unsigned long, flags)
1279{
1280	int err;
1281	int uninitialized_var(pval);
1282	nodemask_t nodes;
1283
1284	if (nmask != NULL && maxnode < MAX_NUMNODES)
1285		return -EINVAL;
1286
1287	err = do_get_mempolicy(&pval, &nodes, addr, flags);
1288
1289	if (err)
1290		return err;
1291
1292	if (policy && put_user(pval, policy))
1293		return -EFAULT;
1294
1295	if (nmask)
1296		err = copy_nodes_to_user(nmask, maxnode, &nodes);
1297
1298	return err;
1299}
1300
1301#ifdef CONFIG_COMPAT
1302
1303asmlinkage long compat_sys_get_mempolicy(int __user *policy,
1304				     compat_ulong_t __user *nmask,
1305				     compat_ulong_t maxnode,
1306				     compat_ulong_t addr, compat_ulong_t flags)
1307{
1308	long err;
1309	unsigned long __user *nm = NULL;
1310	unsigned long nr_bits, alloc_size;
1311	DECLARE_BITMAP(bm, MAX_NUMNODES);
1312
1313	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1314	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1315
1316	if (nmask)
1317		nm = compat_alloc_user_space(alloc_size);
1318
1319	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1320
1321	if (!err && nmask) {
1322		err = copy_from_user(bm, nm, alloc_size);
1323		/* ensure entire bitmap is zeroed */
1324		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1325		err |= compat_put_bitmap(nmask, bm, nr_bits);
1326	}
1327
1328	return err;
1329}
1330
1331asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1332				     compat_ulong_t maxnode)
1333{
1334	long err = 0;
1335	unsigned long __user *nm = NULL;
1336	unsigned long nr_bits, alloc_size;
1337	DECLARE_BITMAP(bm, MAX_NUMNODES);
1338
1339	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1340	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1341
1342	if (nmask) {
1343		err = compat_get_bitmap(bm, nmask, nr_bits);
1344		nm = compat_alloc_user_space(alloc_size);
1345		err |= copy_to_user(nm, bm, alloc_size);
1346	}
1347
1348	if (err)
1349		return -EFAULT;
1350
1351	return sys_set_mempolicy(mode, nm, nr_bits+1);
1352}
1353
1354asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1355			     compat_ulong_t mode, compat_ulong_t __user *nmask,
1356			     compat_ulong_t maxnode, compat_ulong_t flags)
1357{
1358	long err = 0;
1359	unsigned long __user *nm = NULL;
1360	unsigned long nr_bits, alloc_size;
1361	nodemask_t bm;
1362
1363	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1364	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1365
1366	if (nmask) {
1367		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1368		nm = compat_alloc_user_space(alloc_size);
1369		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1370	}
1371
1372	if (err)
1373		return -EFAULT;
1374
1375	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1376}
1377
1378#endif
1379
1380/*
1381 * get_vma_policy(@task, @vma, @addr)
1382 * @task - task for fallback if vma policy == default
1383 * @vma   - virtual memory area whose policy is sought
1384 * @addr  - address in @vma for shared policy lookup
1385 *
1386 * Returns effective policy for a VMA at specified address.
1387 * Falls back to @task or system default policy, as necessary.
1388 * Current or other task's task mempolicy and non-shared vma policies
1389 * are protected by the task's mmap_sem, which must be held for read by
1390 * the caller.
1391 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1392 * count--added by the get_policy() vm_op, as appropriate--to protect against
1393 * freeing by another task.  It is the caller's responsibility to free the
1394 * extra reference for shared policies.
1395 */
1396static struct mempolicy *get_vma_policy(struct task_struct *task,
1397		struct vm_area_struct *vma, unsigned long addr)
1398{
1399	struct mempolicy *pol = task->mempolicy;
1400
1401	if (vma) {
1402		if (vma->vm_ops && vma->vm_ops->get_policy) {
1403			struct mempolicy *vpol = vma->vm_ops->get_policy(vma,
1404									addr);
1405			if (vpol)
1406				pol = vpol;
1407		} else if (vma->vm_policy)
1408			pol = vma->vm_policy;
1409	}
1410	if (!pol)
1411		pol = &default_policy;
1412	return pol;
1413}
1414
1415/*
1416 * Return a nodemask representing a mempolicy for filtering nodes for
1417 * page allocation
1418 */
1419static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1420{
1421	/* Lower zones don't get a nodemask applied for MPOL_BIND */
1422	if (unlikely(policy->mode == MPOL_BIND) &&
1423			gfp_zone(gfp) >= policy_zone &&
1424			cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1425		return &policy->v.nodes;
1426
1427	return NULL;
1428}
1429
1430/* Return a zonelist indicated by gfp for node representing a mempolicy */
1431static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy)
1432{
1433	int nd = numa_node_id();
1434
1435	switch (policy->mode) {
1436	case MPOL_PREFERRED:
1437		if (!(policy->flags & MPOL_F_LOCAL))
1438			nd = policy->v.preferred_node;
1439		break;
1440	case MPOL_BIND:
1441		/*
1442		 * Normally, MPOL_BIND allocations are node-local within the
1443		 * allowed nodemask.  However, if __GFP_THISNODE is set and the
1444		 * current node isn't part of the mask, we use the zonelist for
1445		 * the first node in the mask instead.
1446		 */
1447		if (unlikely(gfp & __GFP_THISNODE) &&
1448				unlikely(!node_isset(nd, policy->v.nodes)))
1449			nd = first_node(policy->v.nodes);
1450		break;
1451	default:
1452		BUG();
1453	}
1454	return node_zonelist(nd, gfp);
1455}
1456
1457/* Do dynamic interleaving for a process */
1458static unsigned interleave_nodes(struct mempolicy *policy)
1459{
1460	unsigned nid, next;
1461	struct task_struct *me = current;
1462
1463	nid = me->il_next;
1464	next = next_node(nid, policy->v.nodes);
1465	if (next >= MAX_NUMNODES)
1466		next = first_node(policy->v.nodes);
1467	if (next < MAX_NUMNODES)
1468		me->il_next = next;
1469	return nid;
1470}
1471
1472/*
1473 * Depending on the memory policy provide a node from which to allocate the
1474 * next slab entry.
1475 * @policy must be protected from freeing by the caller.  If @policy is
1476 * the current task's mempolicy, this protection is implicit, as only the
1477 * task can change its policy.  The system default policy requires no
1478 * such protection.
1479 */
1480unsigned slab_node(struct mempolicy *policy)
1481{
1482	if (!policy || policy->flags & MPOL_F_LOCAL)
1483		return numa_node_id();
1484
1485	switch (policy->mode) {
1486	case MPOL_PREFERRED:
1487		/*
1488		 * handled MPOL_F_LOCAL above
1489		 */
1490		return policy->v.preferred_node;
1491
1492	case MPOL_INTERLEAVE:
1493		return interleave_nodes(policy);
1494
1495	case MPOL_BIND: {
1496		/*
1497		 * Follow bind policy behavior and start allocation at the
1498		 * first node.
1499		 */
1500		struct zonelist *zonelist;
1501		struct zone *zone;
1502		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1503		zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
1504		(void)first_zones_zonelist(zonelist, highest_zoneidx,
1505							&policy->v.nodes,
1506							&zone);
1507		return zone->node;
1508	}
1509
1510	default:
1511		BUG();
1512	}
1513}
1514
1515/* Do static interleaving for a VMA with known offset. */
1516static unsigned offset_il_node(struct mempolicy *pol,
1517		struct vm_area_struct *vma, unsigned long off)
1518{
1519	unsigned nnodes = nodes_weight(pol->v.nodes);
1520	unsigned target;
1521	int c;
1522	int nid = -1;
1523
1524	if (!nnodes)
1525		return numa_node_id();
1526	target = (unsigned int)off % nnodes;
1527	c = 0;
1528	do {
1529		nid = next_node(nid, pol->v.nodes);
1530		c++;
1531	} while (c <= target);
1532	return nid;
1533}
1534
1535/* Determine a node number for interleave */
1536static inline unsigned interleave_nid(struct mempolicy *pol,
1537		 struct vm_area_struct *vma, unsigned long addr, int shift)
1538{
1539	if (vma) {
1540		unsigned long off;
1541
1542		/*
1543		 * for small pages, there is no difference between
1544		 * shift and PAGE_SHIFT, so the bit-shift is safe.
1545		 * for huge pages, since vm_pgoff is in units of small
1546		 * pages, we need to shift off the always 0 bits to get
1547		 * a useful offset.
1548		 */
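		/*
		 * Example (illustrative): with 2MB huge pages, shift == 21
		 * and PAGE_SHIFT == 12, so vm_pgoff is shifted right by 9 to
		 * count in huge-page units before the interleave calculation.
		 */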
1549		BUG_ON(shift < PAGE_SHIFT);
1550		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1551		off += (addr - vma->vm_start) >> shift;
1552		return offset_il_node(pol, vma, off);
1553	} else
1554		return interleave_nodes(pol);
1555}
1556
1557#ifdef CONFIG_HUGETLBFS
1558/*
1559 * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1560 * @vma = virtual memory area whose policy is sought
1561 * @addr = address in @vma for shared policy lookup and interleave policy
1562 * @gfp_flags = for requested zone
1563 * @mpol = pointer to mempolicy pointer for reference counted mempolicy
1564 * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
1565 *
1566 * Returns a zonelist suitable for a huge page allocation and a pointer
1567 * to the struct mempolicy for conditional unref after allocation.
1568 * If the effective policy is 'BIND', returns a pointer to the mempolicy's
1569 * @nodemask for filtering the zonelist.
1570 */
1571struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1572				gfp_t gfp_flags, struct mempolicy **mpol,
1573				nodemask_t **nodemask)
1574{
1575	struct zonelist *zl;
1576
1577	*mpol = get_vma_policy(current, vma, addr);
1578	*nodemask = NULL;	/* assume !MPOL_BIND */
1579
1580	if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1581		zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1582				huge_page_shift(hstate_vma(vma))), gfp_flags);
1583	} else {
1584		zl = policy_zonelist(gfp_flags, *mpol);
1585		if ((*mpol)->mode == MPOL_BIND)
1586			*nodemask = &(*mpol)->v.nodes;
1587	}
1588	return zl;
1589}
1590
1591/*
1592 * init_nodemask_of_mempolicy
1593 *
1594 * If the current task's mempolicy is "default" [NULL], return 'false'
1595 * to indicate default policy.  Otherwise, extract the policy nodemask
1596 * for 'bind' or 'interleave' policy into the argument nodemask, or
1597 * initialize the argument nodemask to contain the single node for
1598 * 'preferred' or 'local' policy and return 'true' to indicate presence
1599 * of non-default mempolicy.
1600 *
1601 * We don't bother with reference counting the mempolicy [mpol_get/put]
1602 * because the current task is examining its own mempolicy and a task's
1603 * mempolicy is only ever changed by the task itself.
1604 *
1605 * N.B., it is the caller's responsibility to free a returned nodemask.
1606 */
1607bool init_nodemask_of_mempolicy(nodemask_t *mask)
1608{
1609	struct mempolicy *mempolicy;
1610	int nid;
1611
1612	if (!(mask && current->mempolicy))
1613		return false;
1614
1615	mempolicy = current->mempolicy;
1616	switch (mempolicy->mode) {
1617	case MPOL_PREFERRED:
1618		if (mempolicy->flags & MPOL_F_LOCAL)
1619			nid = numa_node_id();
1620		else
1621			nid = mempolicy->v.preferred_node;
1622		init_nodemask_of_node(mask, nid);
1623		break;
1624
1625	case MPOL_BIND:
1626		/* Fall through */
1627	case MPOL_INTERLEAVE:
1628		*mask =  mempolicy->v.nodes;
1629		break;
1630
1631	default:
1632		BUG();
1633	}
1634
1635	return true;
1636}
1637#endif
1638
1639/* Allocate a page in interleaved policy.
1640   Own path because it needs to do special accounting. */
1641static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1642					unsigned nid)
1643{
1644	struct zonelist *zl;
1645	struct page *page;
1646
1647	zl = node_zonelist(nid, gfp);
1648	page = __alloc_pages(gfp, order, zl);
1649	if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1650		inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1651	return page;
1652}
1653
1654/**
1655 * 	alloc_page_vma	- Allocate a page for a VMA.
1656 *
1657 * 	@gfp:
1658 *      %GFP_USER    user allocation.
1659 *      %GFP_KERNEL  kernel allocations,
1660 *      %GFP_HIGHMEM highmem/user allocations,
1661 *      %GFP_FS      allocation should not call back into a file system.
1662 *      %GFP_ATOMIC  don't sleep.
1663 *
1664 * 	@vma:  Pointer to VMA or NULL if not available.
1665 *	@addr: Virtual Address of the allocation. Must be inside the VMA.
1666 *
1667 * 	This function allocates a page from the kernel page pool and applies
1668 *	a NUMA policy associated with the VMA or the current process.
1669 *	When VMA is not NULL caller must hold down_read on the mmap_sem of the
1670 *	mm_struct of the VMA to prevent it from going away. Should be used for
1671 *	all allocations for pages that will be mapped into
1672 * 	user space. Returns NULL when no page can be allocated.
1673 *
1674 *	Should be called with the mmap_sem of the vma held.
1675 */
1676struct page *
1677alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1678{
1679	struct mempolicy *pol = get_vma_policy(current, vma, addr);
1680	struct zonelist *zl;
1681
1682	if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
1683		unsigned nid;
1684
1685		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1686		mpol_cond_put(pol);
1687		return alloc_page_interleave(gfp, 0, nid);
1688	}
1689	zl = policy_zonelist(gfp, pol);
1690	if (unlikely(mpol_needs_cond_ref(pol))) {
1691		/*
1692		 * slow path: ref counted shared policy
1693		 */
1694		struct page *page =  __alloc_pages_nodemask(gfp, 0,
1695						zl, policy_nodemask(gfp, pol));
1696		__mpol_put(pol);
1697		return page;
1698	}
1699	/*
1700	 * fast path:  default or task policy
1701	 */
1702	return __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol));
1703}
1704
1705/**
1706 * 	alloc_pages_current - Allocate pages.
1707 *
1708 *	@gfp:
1709 *		%GFP_USER   user allocation,
1710 *      	%GFP_KERNEL kernel allocation,
1711 *      	%GFP_HIGHMEM highmem allocation,
1712 *      	%GFP_FS     don't call back into a file system.
1713 *      	%GFP_ATOMIC don't sleep.
1714 *	@order: Power of two of allocation size in pages. 0 is a single page.
1715 *
1716 *	Allocate a page from the kernel page pool.  When not in
1717 *	interrupt context, apply the current process' NUMA policy.
1718 *	Returns NULL when no page can be allocated.
1719 *
1720 *	Don't call cpuset_update_task_memory_state() unless
1721 *	1) it's ok to take cpuset_sem (can WAIT), and
1722 *	2) allocating for current task (not interrupt).
1723 */
1724struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1725{
1726	struct mempolicy *pol = current->mempolicy;
1727
1728	if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1729		pol = &default_policy;
1730
1731	/*
1732	 * No reference counting needed for current->mempolicy
1733	 * nor system default_policy
1734	 */
1735	if (pol->mode == MPOL_INTERLEAVE)
1736		return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1737	return __alloc_pages_nodemask(gfp, order,
1738			policy_zonelist(gfp, pol), policy_nodemask(gfp, pol));
1739}
1740EXPORT_SYMBOL(alloc_pages_current);
1741
1742/*
1743 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
1744 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
1745 * with the mems_allowed returned by cpuset_mems_allowed().  This
1746 * keeps mempolicies cpuset relative after its cpuset moves.  See
1747 * further kernel/cpuset.c update_nodemask().
1748 */
1749
1750/* Slow path of a mempolicy duplicate */
1751struct mempolicy *__mpol_dup(struct mempolicy *old)
1752{
1753	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1754
1755	if (!new)
1756		return ERR_PTR(-ENOMEM);
1757	rcu_read_lock();
1758	if (current_cpuset_is_being_rebound()) {
1759		nodemask_t mems = cpuset_mems_allowed(current);
1760		mpol_rebind_policy(old, &mems);
1761	}
1762	rcu_read_unlock();
1763	*new = *old;
1764	atomic_set(&new->refcnt, 1);
1765	return new;
1766}
1767
1768/*
1769 * If *frompol needs [has] an extra ref, copy *frompol to *tompol,
1770 * eliminate the MPOL_F_* flags that require conditional ref and
1771 * [NOTE!!!] drop the extra ref.  Not safe to reference *frompol directly
1772 * after return.  Use the returned value.
1773 *
1774 * Allows use of a mempolicy for, e.g., multiple allocations with a single
1775 * policy lookup, even if the policy needs/has extra ref on lookup.
1776 * shmem_readahead needs this.
1777 */
1778struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol,
1779						struct mempolicy *frompol)
1780{
1781	if (!mpol_needs_cond_ref(frompol))
1782		return frompol;
1783
1784	*tompol = *frompol;
1785	tompol->flags &= ~MPOL_F_SHARED;	/* copy doesn't need unref */
1786	__mpol_put(frompol);
1787	return tompol;
1788}
1789
1790/* Slow path of a mempolicy comparison */
1791int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1792{
1793	if (!a || !b)
1794		return 0;
1795	if (a->mode != b->mode)
1796		return 0;
1797	if (a->flags != b->flags)
1798		return 0;
1799	if (mpol_store_user_nodemask(a))
1800		if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
1801			return 0;
1802
1803	switch (a->mode) {
1804	case MPOL_BIND:
1805		/* Fall through */
1806	case MPOL_INTERLEAVE:
1807		return nodes_equal(a->v.nodes, b->v.nodes);
1808	case MPOL_PREFERRED:
1809		return a->v.preferred_node == b->v.preferred_node &&
1810			a->flags == b->flags;
1811	default:
1812		BUG();
1813		return 0;
1814	}
1815}
1816
1817/*
1818 * Shared memory backing store policy support.
1819 *
1820 * Remember policies even when nobody has shared memory mapped.
1821 * The policies are kept in Red-Black tree linked from the inode.
1822 * They are protected by the sp->lock spinlock, which should be held
1823 * for any accesses to the tree.
1824 */
1825
1826/* lookup first element intersecting start-end */
1827/* Caller holds sp->lock */
1828static struct sp_node *
1829sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1830{
1831	struct rb_node *n = sp->root.rb_node;
1832
1833	while (n) {
1834		struct sp_node *p = rb_entry(n, struct sp_node, nd);
1835
1836		if (start >= p->end)
1837			n = n->rb_right;
1838		else if (end <= p->start)
1839			n = n->rb_left;
1840		else
1841			break;
1842	}
1843	if (!n)
1844		return NULL;
1845	for (;;) {
1846		struct sp_node *w = NULL;
1847		struct rb_node *prev = rb_prev(n);
1848		if (!prev)
1849			break;
1850		w = rb_entry(prev, struct sp_node, nd);
1851		if (w->end <= start)
1852			break;
1853		n = prev;
1854	}
1855	return rb_entry(n, struct sp_node, nd);
1856}
1857
1858/* Insert a new shared policy into the list. */
1859/* Caller holds sp->lock */
1860static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1861{
1862	struct rb_node **p = &sp->root.rb_node;
1863	struct rb_node *parent = NULL;
1864	struct sp_node *nd;
1865
1866	while (*p) {
1867		parent = *p;
1868		nd = rb_entry(parent, struct sp_node, nd);
1869		if (new->start < nd->start)
1870			p = &(*p)->rb_left;
1871		else if (new->end > nd->end)
1872			p = &(*p)->rb_right;
1873		else
1874			BUG();
1875	}
1876	rb_link_node(&new->nd, parent, p);
1877	rb_insert_color(&new->nd, &sp->root);
1878	pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
1879		 new->policy ? new->policy->mode : 0);
1880}
1881
1882/* Find shared policy intersecting idx */
1883struct mempolicy *
1884mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1885{
1886	struct mempolicy *pol = NULL;
1887	struct sp_node *sn;
1888
1889	if (!sp->root.rb_node)
1890		return NULL;
1891	spin_lock(&sp->lock);
1892	sn = sp_lookup(sp, idx, idx+1);
1893	if (sn) {
1894		mpol_get(sn->policy);
1895		pol = sn->policy;
1896	}
1897	spin_unlock(&sp->lock);
1898	return pol;
1899}
1900
1901static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1902{
1903	pr_debug("deleting %lx-%lx\n", n->start, n->end);
1904	rb_erase(&n->nd, &sp->root);
1905	mpol_put(n->policy);
1906	kmem_cache_free(sn_cache, n);
1907}
1908
1909static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
1910				struct mempolicy *pol)
1911{
1912	struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1913
1914	if (!n)
1915		return NULL;
1916	n->start = start;
1917	n->end = end;
1918	mpol_get(pol);
1919	pol->flags |= MPOL_F_SHARED;	/* for unref */
1920	n->policy = pol;
1921	return n;
1922}
1923
1924/* Replace a policy range. */
1925static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1926				 unsigned long end, struct sp_node *new)
1927{
1928	struct sp_node *n, *new2 = NULL;
1929
1930restart:
1931	spin_lock(&sp->lock);
1932	n = sp_lookup(sp, start, end);
1933	/* Take care of old policies in the same range. */
1934	while (n && n->start < end) {
1935		struct rb_node *next = rb_next(&n->nd);
1936		if (n->start >= start) {
1937			if (n->end <= end)
1938				sp_delete(sp, n);
1939			else
1940				n->start = end;
1941		} else {
1942			/* Old policy spanning whole new range. */
1943			if (n->end > end) {
1944				if (!new2) {
1945					spin_unlock(&sp->lock);
1946					new2 = sp_alloc(end, n->end, n->policy);
1947					if (!new2)
1948						return -ENOMEM;
1949					goto restart;
1950				}
1951				n->end = start;
1952				sp_insert(sp, new2);
1953				new2 = NULL;
1954				break;
1955			} else
1956				n->end = start;
1957		}
1958		if (!next)
1959			break;
1960		n = rb_entry(next, struct sp_node, nd);
1961	}
1962	if (new)
1963		sp_insert(sp, new);
1964	spin_unlock(&sp->lock);
1965	if (new2) {
1966		mpol_put(new2->policy);
1967		kmem_cache_free(sn_cache, new2);
1968	}
1969	return 0;
1970}
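
/*
 * Summary of how the loop above rewrites the tree for a new range
 * [start, end):
 *
 *	old node entirely inside [start, end)  -> deleted
 *	old node overlapping only one edge     -> trimmed back to its
 *	                                          non-overlapping part
 *	one old node spanning the whole range  -> split in two; the tail
 *	                                          piece is new2, allocated
 *	                                          with sp->lock dropped,
 *	                                          hence the restart
 *
 * Finally the new node, if any, is inserted into the resulting hole.
 */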
1971
1972/**
1973 * mpol_shared_policy_init - initialize shared policy for inode
1974 * @sp: pointer to inode shared policy
1975 * @mpol:  struct mempolicy to install
1976 *
1977 * Install non-NULL @mpol in the inode's shared policy rb-tree.
1978 * On entry, the current task has a reference on a non-NULL @mpol.
1979 * This must be released on exit.
1980 * This is called at get_inode() time, so we can use GFP_KERNEL.
1981 */
1982void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
1983{
1984	int ret;
1985
1986	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
1987	spin_lock_init(&sp->lock);
1988
1989	if (mpol) {
1990		struct vm_area_struct pvma;
1991		struct mempolicy *new;
1992		NODEMASK_SCRATCH(scratch);
1993
		if (!scratch) {
			mpol_put(mpol);	/* must still drop our ref on sb mpol */
			return;
		}
1996		/* contextualize the tmpfs mount point mempolicy */
1997		new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
1998		if (IS_ERR(new)) {
1999			mpol_put(mpol);	/* drop our ref on sb mpol */
2000			NODEMASK_SCRATCH_FREE(scratch);
2001			return;		/* no valid nodemask intersection */
2002		}
2003
2004		task_lock(current);
2005		ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2006		task_unlock(current);
2007		mpol_put(mpol);	/* drop our ref on sb mpol */
2008		if (ret) {
2009			NODEMASK_SCRATCH_FREE(scratch);
2010			mpol_put(new);
2011			return;
2012		}
2013
2014		/* Create pseudo-vma that contains just the policy */
2015		memset(&pvma, 0, sizeof(struct vm_area_struct));
2016		pvma.vm_end = TASK_SIZE;	/* policy covers entire file */
2017		mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2018		mpol_put(new);			/* drop initial ref */
2019		NODEMASK_SCRATCH_FREE(scratch);
2020	}
2021}
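
/*
 * Typical caller (a sketch; see mm/shmem.c): shmem inode creation does
 * roughly
 *
 *	mpol_shared_policy_init(&info->policy, shmem_get_sbmpol(sbinfo));
 *
 * handing over a reference on the superblock's mount-option mempolicy
 * (or NULL), which must be dropped here on every exit path above.
 */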
2022
2023int mpol_set_shared_policy(struct shared_policy *info,
2024			struct vm_area_struct *vma, struct mempolicy *npol)
2025{
2026	int err;
2027	struct sp_node *new = NULL;
2028	unsigned long sz = vma_pages(vma);
2029
2030	pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2031		 vma->vm_pgoff,
2032		 sz, npol ? npol->mode : -1,
2033		 npol ? npol->flags : -1,
2034		 npol ? nodes_addr(npol->v.nodes)[0] : -1);
2035
2036	if (npol) {
2037		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2038		if (!new)
2039			return -ENOMEM;
2040	}
2041	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2042	if (err && new)
2043		kmem_cache_free(sn_cache, new);
2044	return err;
2045}
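
/*
 * This is reached when a range of a mapped shared object is given a policy,
 * e.g. via mbind(): the file's ->set_policy method (shmem's, for instance)
 * forwards the VMA and the new policy here, so the policy is recorded
 * against the object's page offsets and survives unmapping.
 */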
2046
2047/* Free a backing policy store on inode delete. */
2048void mpol_free_shared_policy(struct shared_policy *p)
2049{
2050	struct sp_node *n;
2051	struct rb_node *next;
2052
2053	if (!p->root.rb_node)
2054		return;
2055	spin_lock(&p->lock);
2056	next = rb_first(&p->root);
2057	while (next) {
2058		n = rb_entry(next, struct sp_node, nd);
2059		next = rb_next(&n->nd);
2060		rb_erase(&n->nd, &p->root);
2061		mpol_put(n->policy);
2062		kmem_cache_free(sn_cache, n);
2063	}
2064	spin_unlock(&p->lock);
2065}
2066
2067/* assumes fs == KERNEL_DS */
2068void __init numa_policy_init(void)
2069{
2070	nodemask_t interleave_nodes;
2071	unsigned long largest = 0;
2072	int nid, prefer = 0;
2073
2074	policy_cache = kmem_cache_create("numa_policy",
2075					 sizeof(struct mempolicy),
2076					 0, SLAB_PANIC, NULL);
2077
2078	sn_cache = kmem_cache_create("shared_policy_node",
2079				     sizeof(struct sp_node),
2080				     0, SLAB_PANIC, NULL);
2081
2082	/*
2083	 * Set interleaving policy for system init. Interleaving is only
2084	 * enabled across suitably sized nodes (default is >= 16MB), or
2085	 * fall back to the largest node if they're all smaller.
2086	 */
2087	nodes_clear(interleave_nodes);
2088	for_each_node_state(nid, N_HIGH_MEMORY) {
2089		unsigned long total_pages = node_present_pages(nid);
2090
2091		/* Preserve the largest node */
2092		if (largest < total_pages) {
2093			largest = total_pages;
2094			prefer = nid;
2095		}
2096
2097		/* Interleave this node? */
2098		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2099			node_set(nid, interleave_nodes);
2100	}
2101
2102	/* All too small, use the largest */
2103	if (unlikely(nodes_empty(interleave_nodes)))
2104		node_set(prefer, interleave_nodes);
2105
2106	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2107		printk(KERN_WARNING "numa_policy_init: interleaving failed\n");
2108}
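
/*
 * Worked example for the threshold above: with 4 KB pages,
 * (total_pages << PAGE_SHIFT) >= (16 << 20) requires at least 16 MB of
 * present memory, i.e. 4096 pages, before a node joins the boot-time
 * interleave set; if no node qualifies, only the largest node is used.
 */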
2109
2110/* Reset policy of current process to default */
2111void numa_default_policy(void)
2112{
2113	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2114}
2115
2116/*
2117 * Parse and format mempolicy from/to strings
2118 */
2119
2120/*
2121 * "local" is pseudo-policy:  MPOL_PREFERRED with MPOL_F_LOCAL flag
2122 * Used only for mpol_parse_str() and mpol_to_str()
2123 */
2124#define MPOL_LOCAL (MPOL_INTERLEAVE + 1)
2125static const char * const policy_types[] =
2126	{ "default", "prefer", "bind", "interleave", "local" };
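
/*
 * The table above is indexed by mode, so the string order must match the
 * MPOL_* values (MPOL_DEFAULT, MPOL_PREFERRED, MPOL_BIND, MPOL_INTERLEAVE)
 * with the MPOL_LOCAL pseudo-mode defined above appended at the end; both
 * mpol_parse_str() and mpol_to_str() rely on that ordering.
 */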
2127
2128
2129#ifdef CONFIG_TMPFS
2130/**
2131 * mpol_parse_str - parse string to mempolicy
2132 * @str:  string containing mempolicy to parse
2133 * @mpol:  pointer to struct mempolicy pointer, returned on success.
2134 * @no_context:  flag whether to "contextualize" the mempolicy
2135 *
2136 * Format of input:
2137 *	<mode>[=<flags>][:<nodelist>]
2138 *
2139 * if @no_context is true, save the input nodemask in w.user_nodemask in
2140 * the returned mempolicy.  This will be used to "clone" the mempolicy in
2141 * a specific context [cpuset] at a later time.  Used to parse tmpfs mpol
2142 * mount option.  Note that if 'static' or 'relative' mode flags were
2143 * specified, the input nodemask will already have been saved.  Saving
2144 * it again is redundant, but safe.
2145 *
2146 * On success, returns 0, else 1
2147 */
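/*
 * For example (node numbers are arbitrary and must name nodes that actually
 * have memory), the tmpfs "mpol=" mount option accepts strings such as:
 *
 *	default			local
 *	prefer:2		bind:0-3,5
 *	interleave		interleave=static:0-3
 *	bind=relative:0,2
 */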
2148int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2149{
2150	struct mempolicy *new = NULL;
2151	unsigned short mode;
2152	unsigned short uninitialized_var(mode_flags);
2153	nodemask_t nodes;
2154	char *nodelist = strchr(str, ':');
2155	char *flags = strchr(str, '=');
2156	int err = 1;
2157
2158	if (nodelist) {
2159		/* NUL-terminate mode or flags string */
2160		*nodelist++ = '\0';
2161		if (nodelist_parse(nodelist, nodes))
2162			goto out;
2163		if (!nodes_subset(nodes, node_states[N_HIGH_MEMORY]))
2164			goto out;
2165	} else
2166		nodes_clear(nodes);
2167
2168	if (flags)
2169		*flags++ = '\0';	/* terminate mode string */
2170
2171	for (mode = 0; mode <= MPOL_LOCAL; mode++) {
2172		if (!strcmp(str, policy_types[mode])) {
2173			break;
2174		}
2175	}
2176	if (mode > MPOL_LOCAL)
2177		goto out;
2178
2179	switch (mode) {
2180	case MPOL_PREFERRED:
2181		/*
2182		 * Insist on a nodelist of one node only
2183		 */
2184		if (nodelist) {
2185			char *rest = nodelist;
2186			while (isdigit(*rest))
2187				rest++;
2188			if (*rest)
2189				goto out;
2190		}
2191		break;
2192	case MPOL_INTERLEAVE:
2193		/*
2194		 * Default to online nodes with memory if no nodelist
2195		 */
2196		if (!nodelist)
2197			nodes = node_states[N_HIGH_MEMORY];
2198		break;
2199	case MPOL_LOCAL:
2200		/*
2201		 * Don't allow a nodelist;  mpol_new() checks flags
2202		 */
2203		if (nodelist)
2204			goto out;
2205		mode = MPOL_PREFERRED;
2206		break;
2207	case MPOL_DEFAULT:
2208		/*
2209		 * Insist on a empty nodelist
2210		 */
2211		if (!nodelist)
2212			err = 0;
2213		goto out;
2214	case MPOL_BIND:
2215		/*
2216		 * Insist on a nodelist
2217		 */
2218		if (!nodelist)
2219			goto out;
2220	}
2221
2222	mode_flags = 0;
2223	if (flags) {
2224		/*
2225		 * Currently, we only support two mutually exclusive
2226		 * mode flags.
2227		 */
2228		if (!strcmp(flags, "static"))
2229			mode_flags |= MPOL_F_STATIC_NODES;
2230		else if (!strcmp(flags, "relative"))
2231			mode_flags |= MPOL_F_RELATIVE_NODES;
2232		else
2233			goto out;
2234	}
2235
2236	new = mpol_new(mode, mode_flags, &nodes);
2237	if (IS_ERR(new))
2238		goto out;
2239
2240	if (no_context) {
2241		/* save for contextualization */
2242		new->w.user_nodemask = nodes;
2243	} else {
2244		int ret;
2245		NODEMASK_SCRATCH(scratch);
2246		if (scratch) {
2247			task_lock(current);
2248			ret = mpol_set_nodemask(new, &nodes, scratch);
2249			task_unlock(current);
2250		} else
2251			ret = -ENOMEM;
2252		NODEMASK_SCRATCH_FREE(scratch);
2253		if (ret) {
2254			mpol_put(new);
2255			goto out;
2256		}
2257	}
2258	err = 0;
2259
2260out:
2261	/* Restore string for error message */
2262	if (nodelist)
2263		*--nodelist = ':';
2264	if (flags)
2265		*--flags = '=';
2266	if (!err)
2267		*mpol = new;
2268	return err;
2269}
2270#endif /* CONFIG_TMPFS */
2271
2272/**
2273 * mpol_to_str - format a mempolicy structure for printing
2274 * @buffer:  to contain formatted mempolicy string
2275 * @maxlen:  length of @buffer
2276 * @pol:  pointer to mempolicy to be formatted
2277 * @no_context:  "context free" mempolicy - use nodemask in w.user_nodemask
2278 *
2279 * Convert a mempolicy into a string.
2280 * Returns the number of characters in buffer (if positive)
2281 * or an error (negative)
2282 */
2283int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
2284{
2285	char *p = buffer;
2286	int l;
2287	nodemask_t nodes;
2288	unsigned short mode;
2289	unsigned short flags = pol ? pol->flags : 0;
2290
2291	/*
2292	 * Sanity check:  room for longest mode, flag and some nodes
2293	 */
2294	VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16);
2295
2296	if (!pol || pol == &default_policy)
2297		mode = MPOL_DEFAULT;
2298	else
2299		mode = pol->mode;
2300
2301	switch (mode) {
2302	case MPOL_DEFAULT:
2303		nodes_clear(nodes);
2304		break;
2305
2306	case MPOL_PREFERRED:
2307		nodes_clear(nodes);
2308		if (flags & MPOL_F_LOCAL)
2309			mode = MPOL_LOCAL;	/* pseudo-policy */
2310		else
2311			node_set(pol->v.preferred_node, nodes);
2312		break;
2313
2314	case MPOL_BIND:
2315		/* Fall through */
2316	case MPOL_INTERLEAVE:
2317		if (no_context)
2318			nodes = pol->w.user_nodemask;
2319		else
2320			nodes = pol->v.nodes;
2321		break;
2322
2323	default:
2324		BUG();
2325	}
2326
2327	l = strlen(policy_types[mode]);
2328	if (buffer + maxlen < p + l + 1)
2329		return -ENOSPC;
2330
2331	strcpy(p, policy_types[mode]);
2332	p += l;
2333
2334	if (flags & MPOL_MODE_FLAGS) {
2335		if (buffer + maxlen < p + 2)
2336			return -ENOSPC;
2337		*p++ = '=';
2338
2339		/*
2340		 * Currently, the only defined flags are mutually exclusive
2341		 */
2342		if (flags & MPOL_F_STATIC_NODES)
2343			p += snprintf(p, buffer + maxlen - p, "static");
2344		else if (flags & MPOL_F_RELATIVE_NODES)
2345			p += snprintf(p, buffer + maxlen - p, "relative");
2346	}
2347
2348	if (!nodes_empty(nodes)) {
2349		if (buffer + maxlen < p + 2)
2350			return -ENOSPC;
2351		*p++ = ':';
2352		p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
2353	}
2354	return p - buffer;
2355}
2356
2357struct numa_maps {
2358	unsigned long pages;
2359	unsigned long anon;
2360	unsigned long active;
2361	unsigned long writeback;
2362	unsigned long mapcount_max;
2363	unsigned long dirty;
2364	unsigned long swapcache;
2365	unsigned long node[MAX_NUMNODES];
2366};
2367
2368static void gather_stats(struct page *page, void *private, int pte_dirty)
2369{
2370	struct numa_maps *md = private;
2371	int count = page_mapcount(page);
2372
2373	md->pages++;
2374	if (pte_dirty || PageDirty(page))
2375		md->dirty++;
2376
2377	if (PageSwapCache(page))
2378		md->swapcache++;
2379
2380	if (PageActive(page) || PageUnevictable(page))
2381		md->active++;
2382
2383	if (PageWriteback(page))
2384		md->writeback++;
2385
2386	if (PageAnon(page))
2387		md->anon++;
2388
2389	if (count > md->mapcount_max)
2390		md->mapcount_max = count;
2391
2392	md->node[page_to_nid(page)]++;
2393}
2394
2395#ifdef CONFIG_HUGETLB_PAGE
2396static void check_huge_range(struct vm_area_struct *vma,
2397		unsigned long start, unsigned long end,
2398		struct numa_maps *md)
2399{
2400	unsigned long addr;
2401	struct page *page;
2402	struct hstate *h = hstate_vma(vma);
2403	unsigned long sz = huge_page_size(h);
2404
2405	for (addr = start; addr < end; addr += sz) {
2406		pte_t *ptep = huge_pte_offset(vma->vm_mm,
2407						addr & huge_page_mask(h));
2408		pte_t pte;
2409
2410		if (!ptep)
2411			continue;
2412
2413		pte = *ptep;
2414		if (pte_none(pte))
2415			continue;
2416
2417		page = pte_page(pte);
2418		if (!page)
2419			continue;
2420
2421		gather_stats(page, md, pte_dirty(*ptep));
2422	}
2423}
2424#else
2425static inline void check_huge_range(struct vm_area_struct *vma,
2426		unsigned long start, unsigned long end,
2427		struct numa_maps *md)
2428{
2429}
2430#endif
2431
2432/*
2433 * Display pages allocated per node and memory policy via /proc.
2434 */
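/*
 * Example output lines in /proc/<pid>/numa_maps (values are illustrative;
 * only non-zero counters are emitted):
 *
 *	00400000 default file=/bin/bash mapped=100 mapmax=2 active=90 N0=80 N1=20
 *	7f1c2000 interleave=static:0-3 anon=16 dirty=16 N0=4 N1=4 N2=4 N3=4
 */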
2435int show_numa_map(struct seq_file *m, void *v)
2436{
2437	struct proc_maps_private *priv = m->private;
2438	struct vm_area_struct *vma = v;
2439	struct numa_maps *md;
2440	struct file *file = vma->vm_file;
2441	struct mm_struct *mm = vma->vm_mm;
2442	struct mempolicy *pol;
2443	int n;
2444	char buffer[50];
2445
2446	if (!mm)
2447		return 0;
2448
2449	md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
2450	if (!md)
2451		return 0;
2452
2453	pol = get_vma_policy(priv->task, vma, vma->vm_start);
2454	mpol_to_str(buffer, sizeof(buffer), pol, 0);
2455	mpol_cond_put(pol);
2456
2457	seq_printf(m, "%08lx %s", vma->vm_start, buffer);
2458
2459	if (file) {
2460		seq_printf(m, " file=");
2461		seq_path(m, &file->f_path, "\n\t= ");
2462	} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
2463		seq_printf(m, " heap");
2464	} else if (vma->vm_start <= mm->start_stack &&
2465			vma->vm_end >= mm->start_stack) {
2466		seq_printf(m, " stack");
2467	}
2468
2469	if (is_vm_hugetlb_page(vma)) {
2470		check_huge_range(vma, vma->vm_start, vma->vm_end, md);
2471		seq_printf(m, " huge");
2472	} else {
2473		check_pgd_range(vma, vma->vm_start, vma->vm_end,
2474			&node_states[N_HIGH_MEMORY], MPOL_MF_STATS, md);
2475	}
2476
2477	if (!md->pages)
2478		goto out;
2479
2480	if (md->anon)
2481		seq_printf(m, " anon=%lu", md->anon);
2482
2483	if (md->dirty)
2484		seq_printf(m, " dirty=%lu", md->dirty);
2485
2486	if (md->pages != md->anon && md->pages != md->dirty)
2487		seq_printf(m, " mapped=%lu", md->pages);
2488
2489	if (md->mapcount_max > 1)
2490		seq_printf(m, " mapmax=%lu", md->mapcount_max);
2491
2492	if (md->swapcache)
2493		seq_printf(m, " swapcache=%lu", md->swapcache);
2494
2495	if (md->active < md->pages && !is_vm_hugetlb_page(vma))
2496		seq_printf(m, " active=%lu", md->active);
2497
2498	if (md->writeback)
2499		seq_printf(m, " writeback=%lu", md->writeback);
2500
2501	for_each_node_state(n, N_HIGH_MEMORY)
2502		if (md->node[n])
2503			seq_printf(m, " N%d=%lu", n, md->node[n]);
2504out:
2505	seq_putc(m, '\n');
2506	kfree(md);
2507
2508	if (m->count < m->size)
2509		m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
2510	return 0;
2511}
2512