mempolicy.c revision b05ca7385a2848abdc72051f832722641daed8b0
1/*
2 * Simple NUMA memory policy for the Linux kernel.
3 *
4 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
6 * Subject to the GNU Public License, version 2.
7 *
8 * NUMA policy allows the user to give hints in which node(s) memory should
9 * be allocated.
10 *
11 * Support four policies per VMA and per process:
12 *
13 * The VMA policy has priority over the process policy for a page fault.
14 *
15 * interleave     Allocate memory interleaved over a set of nodes,
16 *                with normal fallback if it fails.
17 *                For VMA based allocations this interleaves based on the
18 *                offset into the backing object or offset into the mapping
19 *                for anonymous memory. For process policy a per-process counter
20 *                is used.
21 *
22 * bind           Only allocate memory on a specific set of nodes,
23 *                no fallback.
24 *                FIXME: memory is allocated starting with the first node
25 *                to the last. It would be better if bind truly restricted
26 *                the allocation to the specified memory nodes instead.
27 *
28 * preferred       Try a specific node first before normal fallback.
29 *                As a special case node -1 here means do the allocation
30 *                on the local CPU. This is normally identical to default,
31 *                but useful to set in a VMA when you have a non default
32 *                process policy.
33 *
34 * default        Allocate on the local node first, or when on a VMA
35 *                use the process policy. This is what Linux always did
36 *		  in a NUMA aware kernel and still does by, ahem, default.
37 *
38 * The process policy is applied for most non-interrupt memory allocations
39 * in that process' context. Interrupts ignore the policies and always
40 * try to allocate on the local CPU. The VMA policy is only applied for memory
41 * allocations for a VMA in the VM.
42 *
43 * Currently there are a few corner cases in swapping where the policy
44 * is not applied, but the majority should be handled. When process policy
45 * is used it is not remembered over swap outs/swap ins.
46 *
47 * Only the highest zone in the zone hierarchy gets policied. Allocations
48 * requesting a lower zone just use default policy. This implies that
49 * on systems with highmem, kernel lowmem allocations don't get policied.
50 * Same with GFP_DMA allocations.
51 *
52 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
53 * all users and remembered even when nobody has memory mapped.
54 */
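/*
 * For illustration only (userspace, error handling omitted): these policies
 * are requested through the set_mempolicy() and mbind() system calls,
 * declared in <numaif.h>.  A process that wants nodes 0 and 1 might do
 * roughly:
 *
 *	unsigned long mask = 0x3;
 *	set_mempolicy(MPOL_INTERLEAVE, &mask, 64);
 *	mbind(addr, length, MPOL_BIND, &mask, 64, 0);
 *
 * where addr/length are assumed to describe an existing mapping and the
 * maxnode argument (64 here) is the number of bits in the node mask.
 */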
55
56/* Notebook:
57   fix mmap readahead to honour policy and enable policy for any page cache
58   object
59   statistics for bigpages
60   global policy for page cache? currently it uses process policy. Requires
61   first item above.
62   handle mremap for shared memory (currently ignored for the policy)
63   grows down?
64   make bind policy root only? It can trigger oom much faster and the
65   kernel does not always cope gracefully with that.
66*/
67
68#include <linux/mempolicy.h>
69#include <linux/mm.h>
70#include <linux/highmem.h>
71#include <linux/hugetlb.h>
72#include <linux/kernel.h>
73#include <linux/sched.h>
74#include <linux/nodemask.h>
75#include <linux/cpuset.h>
76#include <linux/gfp.h>
77#include <linux/slab.h>
78#include <linux/string.h>
79#include <linux/module.h>
80#include <linux/nsproxy.h>
81#include <linux/interrupt.h>
82#include <linux/init.h>
83#include <linux/compat.h>
84#include <linux/swap.h>
85#include <linux/seq_file.h>
86#include <linux/proc_fs.h>
87#include <linux/migrate.h>
88#include <linux/rmap.h>
89#include <linux/security.h>
90#include <linux/syscalls.h>
91#include <linux/ctype.h>
92
93#include <asm/tlbflush.h>
94#include <asm/uaccess.h>
95
96#include "internal.h"
97
98/* Internal flags */
99#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for contiguous vmas */
100#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
101#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)		/* Gather statistics */
102
103static struct kmem_cache *policy_cache;
104static struct kmem_cache *sn_cache;
105
106/* Highest zone. A specific allocation for a zone below that is not
107   policied. */
108enum zone_type policy_zone = 0;
109
110/*
111 * run-time system-wide default policy => local allocation
112 */
113struct mempolicy default_policy = {
114	.refcnt = ATOMIC_INIT(1), /* never free it */
115	.mode = MPOL_PREFERRED,
116	.flags = MPOL_F_LOCAL,
117};
118
119static const struct mempolicy_operations {
120	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
121	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
122} mpol_ops[MPOL_MAX];
123
124/* Check that the nodemask contains at least one populated zone */
125static int is_valid_nodemask(const nodemask_t *nodemask)
126{
127	int nd, k;
128
129	/* Check that there is something useful in this mask */
130	k = policy_zone;
131
132	for_each_node_mask(nd, *nodemask) {
133		struct zone *z;
134
135		for (k = 0; k <= policy_zone; k++) {
136			z = &NODE_DATA(nd)->node_zones[k];
137			if (z->present_pages > 0)
138				return 1;
139		}
140	}
141
142	return 0;
143}
144
145static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
146{
147	return pol->flags & (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES);
148}
149
150static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
151				   const nodemask_t *rel)
152{
153	nodemask_t tmp;
154	nodes_fold(tmp, *orig, nodes_weight(*rel));
155	nodes_onto(*ret, tmp, *rel);
156}
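/*
 * Illustrative example of the relative-nodes mapping above: if the allowed
 * set *rel is {4,5,6,7} (weight 4) and the user's relative mask *orig is
 * {0,2,5}, nodes_fold() wraps bit 5 to 5 % 4 = 1, giving {0,1,2};
 * nodes_onto() then maps those positions onto the 1st, 2nd and 3rd set
 * bits of *rel, so *ret ends up as {4,5,6}.
 */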
157
158static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
159{
160	if (nodes_empty(*nodes))
161		return -EINVAL;
162	pol->v.nodes = *nodes;
163	return 0;
164}
165
166static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
167{
168	if (!nodes)
169		pol->flags |= MPOL_F_LOCAL;	/* local allocation */
170	else if (nodes_empty(*nodes))
171		return -EINVAL;			/*  no allowed nodes */
172	else
173		pol->v.preferred_node = first_node(*nodes);
174	return 0;
175}
176
177static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
178{
179	if (!is_valid_nodemask(nodes))
180		return -EINVAL;
181	pol->v.nodes = *nodes;
182	return 0;
183}
184
185/*
186 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
187 * any, for the new policy.  mpol_new() has already validated the nodes
188 * parameter with respect to the policy mode and flags.  But, we need to
189 * handle an empty nodemask with MPOL_PREFERRED here.
190 *
191 * Must be called holding task's alloc_lock to protect task's mems_allowed
192 * and mempolicy.  May also be called holding mmap_sem for write.
193 */
194static int mpol_set_nodemask(struct mempolicy *pol,
195		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
196{
197	int ret;
198
199	/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
200	if (pol == NULL)
201		return 0;
202	/* Check N_HIGH_MEMORY */
203	nodes_and(nsc->mask1,
204		  cpuset_current_mems_allowed, node_states[N_HIGH_MEMORY]);
205
206	VM_BUG_ON(!nodes);
207	if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
208		nodes = NULL;	/* explicit local allocation */
209	else {
210		if (pol->flags & MPOL_F_RELATIVE_NODES)
211			mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
212		else
213			nodes_and(nsc->mask2, *nodes, nsc->mask1);
214
215		if (mpol_store_user_nodemask(pol))
216			pol->w.user_nodemask = *nodes;
217		else
218			pol->w.cpuset_mems_allowed =
219						cpuset_current_mems_allowed;
220	}
221
222	if (nodes)
223		ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
224	else
225		ret = mpol_ops[pol->mode].create(pol, NULL);
226	return ret;
227}
228
229/*
230 * This function just creates a new policy, does some checks and simple
231 * initialization. You must invoke mpol_set_nodemask() to set nodes.
232 */
233static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
234				  nodemask_t *nodes)
235{
236	struct mempolicy *policy;
237
238	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
239		 mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
240
241	if (mode == MPOL_DEFAULT) {
242		if (nodes && !nodes_empty(*nodes))
243			return ERR_PTR(-EINVAL);
244		return NULL;	/* simply delete any existing policy */
245	}
246	VM_BUG_ON(!nodes);
247
248	/*
249	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
250	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
251	 * All other modes require a valid pointer to a non-empty nodemask.
252	 */
253	if (mode == MPOL_PREFERRED) {
254		if (nodes_empty(*nodes)) {
255			if (((flags & MPOL_F_STATIC_NODES) ||
256			     (flags & MPOL_F_RELATIVE_NODES)))
257				return ERR_PTR(-EINVAL);
258		}
259	} else if (nodes_empty(*nodes))
260		return ERR_PTR(-EINVAL);
261	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
262	if (!policy)
263		return ERR_PTR(-ENOMEM);
264	atomic_set(&policy->refcnt, 1);
265	policy->mode = mode;
266	policy->flags = flags;
267
268	return policy;
269}
270
271/* Slow path of a mpol destructor. */
272void __mpol_put(struct mempolicy *p)
273{
274	if (!atomic_dec_and_test(&p->refcnt))
275		return;
276	kmem_cache_free(policy_cache, p);
277}
278
279static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
280{
281}
282
283static void mpol_rebind_nodemask(struct mempolicy *pol,
284				 const nodemask_t *nodes)
285{
286	nodemask_t tmp;
287
288	if (pol->flags & MPOL_F_STATIC_NODES)
289		nodes_and(tmp, pol->w.user_nodemask, *nodes);
290	else if (pol->flags & MPOL_F_RELATIVE_NODES)
291		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
292	else {
293		nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed,
294			    *nodes);
295		pol->w.cpuset_mems_allowed = *nodes;
296	}
297
298	pol->v.nodes = tmp;
299	if (!node_isset(current->il_next, tmp)) {
300		current->il_next = next_node(current->il_next, tmp);
301		if (current->il_next >= MAX_NUMNODES)
302			current->il_next = first_node(tmp);
303		if (current->il_next >= MAX_NUMNODES)
304			current->il_next = numa_node_id();
305	}
306}
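/*
 * Illustrative example of the rebind cases above: an interleave policy
 * created with MPOL_F_STATIC_NODES over {0,1,2} simply becomes {1,2} when
 * its cpuset is rebound to mems {1,2}.  Without user flags the mask is
 * remapped by position instead: {0,1,2} within old mems {0,1,2} rebound
 * to new mems {3,4,5} becomes {3,4,5}.
 */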
307
308static void mpol_rebind_preferred(struct mempolicy *pol,
309				  const nodemask_t *nodes)
310{
311	nodemask_t tmp;
312
313	if (pol->flags & MPOL_F_STATIC_NODES) {
314		int node = first_node(pol->w.user_nodemask);
315
316		if (node_isset(node, *nodes)) {
317			pol->v.preferred_node = node;
318			pol->flags &= ~MPOL_F_LOCAL;
319		} else
320			pol->flags |= MPOL_F_LOCAL;
321	} else if (pol->flags & MPOL_F_RELATIVE_NODES) {
322		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
323		pol->v.preferred_node = first_node(tmp);
324	} else if (!(pol->flags & MPOL_F_LOCAL)) {
325		pol->v.preferred_node = node_remap(pol->v.preferred_node,
326						   pol->w.cpuset_mems_allowed,
327						   *nodes);
328		pol->w.cpuset_mems_allowed = *nodes;
329	}
330}
331
332/* Migrate a policy to a different set of nodes */
333static void mpol_rebind_policy(struct mempolicy *pol,
334			       const nodemask_t *newmask)
335{
336	if (!pol)
337		return;
338	if (!mpol_store_user_nodemask(pol) &&
339	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
340		return;
341	mpol_ops[pol->mode].rebind(pol, newmask);
342}
343
344/*
345 * Wrapper for mpol_rebind_policy() that just requires a task
346 * pointer, and updates the task's mempolicy.
347 *
348 * Called with task's alloc_lock held.
349 */
350
351void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
352{
353	mpol_rebind_policy(tsk->mempolicy, new);
354}
355
356/*
357 * Rebind each vma in mm to new nodemask.
358 *
359 * Call holding a reference to mm.  Takes mm->mmap_sem during call.
360 */
361
362void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
363{
364	struct vm_area_struct *vma;
365
366	down_write(&mm->mmap_sem);
367	for (vma = mm->mmap; vma; vma = vma->vm_next)
368		mpol_rebind_policy(vma->vm_policy, new);
369	up_write(&mm->mmap_sem);
370}
371
372static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
373	[MPOL_DEFAULT] = {
374		.rebind = mpol_rebind_default,
375	},
376	[MPOL_INTERLEAVE] = {
377		.create = mpol_new_interleave,
378		.rebind = mpol_rebind_nodemask,
379	},
380	[MPOL_PREFERRED] = {
381		.create = mpol_new_preferred,
382		.rebind = mpol_rebind_preferred,
383	},
384	[MPOL_BIND] = {
385		.create = mpol_new_bind,
386		.rebind = mpol_rebind_nodemask,
387	},
388};
389
390static void gather_stats(struct page *, void *, int pte_dirty);
391static void migrate_page_add(struct page *page, struct list_head *pagelist,
392				unsigned long flags);
393
394/* Scan through the pages in a range, checking whether they satisfy the given conditions. */
395static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
396		unsigned long addr, unsigned long end,
397		const nodemask_t *nodes, unsigned long flags,
398		void *private)
399{
400	pte_t *orig_pte;
401	pte_t *pte;
402	spinlock_t *ptl;
403
404	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
405	do {
406		struct page *page;
407		int nid;
408
409		if (!pte_present(*pte))
410			continue;
411		page = vm_normal_page(vma, addr, *pte);
412		if (!page)
413			continue;
414		/*
415		 * The check for PageReserved here is important to avoid
416		 * handling zero pages and other pages that may have been
417		 * marked special by the system.
418		 *
419		 * If PageReserved were not checked here then, for example,
420		 * the location of the zero page could influence the result
421		 * of MPOL_MF_STRICT, zero pages would be counted in
422		 * the per-node stats, and there would be useless attempts
423		 * to put zero pages on the migration list.
424		 */
425		if (PageReserved(page))
426			continue;
427		nid = page_to_nid(page);
428		if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
429			continue;
430
431		if (flags & MPOL_MF_STATS)
432			gather_stats(page, private, pte_dirty(*pte));
433		else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
434			migrate_page_add(page, private, flags);
435		else
436			break;
437	} while (pte++, addr += PAGE_SIZE, addr != end);
438	pte_unmap_unlock(orig_pte, ptl);
439	return addr != end;
440}
441
442static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
443		unsigned long addr, unsigned long end,
444		const nodemask_t *nodes, unsigned long flags,
445		void *private)
446{
447	pmd_t *pmd;
448	unsigned long next;
449
450	pmd = pmd_offset(pud, addr);
451	do {
452		next = pmd_addr_end(addr, end);
453		if (pmd_none_or_clear_bad(pmd))
454			continue;
455		if (check_pte_range(vma, pmd, addr, next, nodes,
456				    flags, private))
457			return -EIO;
458	} while (pmd++, addr = next, addr != end);
459	return 0;
460}
461
462static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
463		unsigned long addr, unsigned long end,
464		const nodemask_t *nodes, unsigned long flags,
465		void *private)
466{
467	pud_t *pud;
468	unsigned long next;
469
470	pud = pud_offset(pgd, addr);
471	do {
472		next = pud_addr_end(addr, end);
473		if (pud_none_or_clear_bad(pud))
474			continue;
475		if (check_pmd_range(vma, pud, addr, next, nodes,
476				    flags, private))
477			return -EIO;
478	} while (pud++, addr = next, addr != end);
479	return 0;
480}
481
482static inline int check_pgd_range(struct vm_area_struct *vma,
483		unsigned long addr, unsigned long end,
484		const nodemask_t *nodes, unsigned long flags,
485		void *private)
486{
487	pgd_t *pgd;
488	unsigned long next;
489
490	pgd = pgd_offset(vma->vm_mm, addr);
491	do {
492		next = pgd_addr_end(addr, end);
493		if (pgd_none_or_clear_bad(pgd))
494			continue;
495		if (check_pud_range(vma, pgd, addr, next, nodes,
496				    flags, private))
497			return -EIO;
498	} while (pgd++, addr = next, addr != end);
499	return 0;
500}
501
502/*
503 * Check if all pages in a range are on a set of nodes.
504 * If pagelist != NULL then isolate pages from the LRU and
505 * put them on the pagelist.
506 */
507static struct vm_area_struct *
508check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
509		const nodemask_t *nodes, unsigned long flags, void *private)
510{
511	int err;
512	struct vm_area_struct *first, *vma, *prev;
513
514
515	first = find_vma(mm, start);
516	if (!first)
517		return ERR_PTR(-EFAULT);
518	prev = NULL;
519	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
520		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
521			if (!vma->vm_next && vma->vm_end < end)
522				return ERR_PTR(-EFAULT);
523			if (prev && prev->vm_end < vma->vm_start)
524				return ERR_PTR(-EFAULT);
525		}
526		if (!is_vm_hugetlb_page(vma) &&
527		    ((flags & MPOL_MF_STRICT) ||
528		     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
529				vma_migratable(vma)))) {
530			unsigned long endvma = vma->vm_end;
531
532			if (endvma > end)
533				endvma = end;
534			if (vma->vm_start > start)
535				start = vma->vm_start;
536			err = check_pgd_range(vma, start, endvma, nodes,
537						flags, private);
538			if (err) {
539				first = ERR_PTR(err);
540				break;
541			}
542		}
543		prev = vma;
544	}
545	return first;
546}
547
548/* Apply policy to a single VMA */
549static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
550{
551	int err = 0;
552	struct mempolicy *old = vma->vm_policy;
553
554	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
555		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
556		 vma->vm_ops, vma->vm_file,
557		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
558
559	if (vma->vm_ops && vma->vm_ops->set_policy)
560		err = vma->vm_ops->set_policy(vma, new);
561	if (!err) {
562		mpol_get(new);
563		vma->vm_policy = new;
564		mpol_put(old);
565	}
566	return err;
567}
568
569/* Step 2: apply policy to a range and do splits. */
570static int mbind_range(struct vm_area_struct *vma, unsigned long start,
571		       unsigned long end, struct mempolicy *new)
572{
573	struct vm_area_struct *next;
574	int err;
575
576	err = 0;
577	for (; vma && vma->vm_start < end; vma = next) {
578		next = vma->vm_next;
579		if (vma->vm_start < start)
580			err = split_vma(vma->vm_mm, vma, start, 1);
581		if (!err && vma->vm_end > end)
582			err = split_vma(vma->vm_mm, vma, end, 0);
583		if (!err)
584			err = policy_vma(vma, new);
585		if (err)
586			break;
587	}
588	return err;
589}
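/*
 * For example (illustrative): an mbind() over the middle of a single vma
 * splits it into three vmas, and only the middle one, which exactly covers
 * [start, end), receives the new policy; the outer two keep the old one.
 */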
590
591/*
592 * Update task->flags PF_MEMPOLICY bit: set iff non-default
593 * mempolicy.  Allows more rapid checking of this (combined perhaps
594 * with other PF_* flag bits) on memory allocation hot code paths.
595 *
596 * If called from outside this file, the task 'p' should -only- be
597 * a newly forked child not yet visible on the task list, because
598 * manipulating the task flags of a visible task is not safe.
599 *
600 * The above limitation is why this routine has the funny name
601 * mpol_fix_fork_child_flag().
602 *
603 * It is also safe to call this with a task pointer of current,
604 * which the static wrapper mpol_set_task_struct_flag() does,
605 * for use within this file.
606 */
607
608void mpol_fix_fork_child_flag(struct task_struct *p)
609{
610	if (p->mempolicy)
611		p->flags |= PF_MEMPOLICY;
612	else
613		p->flags &= ~PF_MEMPOLICY;
614}
615
616static void mpol_set_task_struct_flag(void)
617{
618	mpol_fix_fork_child_flag(current);
619}
620
621/* Set the process memory policy */
622static long do_set_mempolicy(unsigned short mode, unsigned short flags,
623			     nodemask_t *nodes)
624{
625	struct mempolicy *new, *old;
626	struct mm_struct *mm = current->mm;
627	NODEMASK_SCRATCH(scratch);
628	int ret;
629
630	if (!scratch)
631		return -ENOMEM;
632
633	new = mpol_new(mode, flags, nodes);
634	if (IS_ERR(new)) {
635		ret = PTR_ERR(new);
636		goto out;
637	}
638	/*
639	 * prevent changing our mempolicy while show_numa_maps()
640	 * is using it.
641	 * Note:  do_set_mempolicy() can be called at init time
642	 * with no 'mm'.
643	 */
644	if (mm)
645		down_write(&mm->mmap_sem);
646	task_lock(current);
647	ret = mpol_set_nodemask(new, nodes, scratch);
648	if (ret) {
649		task_unlock(current);
650		if (mm)
651			up_write(&mm->mmap_sem);
652		mpol_put(new);
653		goto out;
654	}
655	old = current->mempolicy;
656	current->mempolicy = new;
657	mpol_set_task_struct_flag();
658	if (new && new->mode == MPOL_INTERLEAVE &&
659	    nodes_weight(new->v.nodes))
660		current->il_next = first_node(new->v.nodes);
661	task_unlock(current);
662	if (mm)
663		up_write(&mm->mmap_sem);
664
665	mpol_put(old);
666	ret = 0;
667out:
668	NODEMASK_SCRATCH_FREE(scratch);
669	return ret;
670}
671
672/*
673 * Return nodemask for policy for get_mempolicy() query
674 *
675 * Called with task's alloc_lock held
676 */
677static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
678{
679	nodes_clear(*nodes);
680	if (p == &default_policy)
681		return;
682
683	switch (p->mode) {
684	case MPOL_BIND:
685		/* Fall through */
686	case MPOL_INTERLEAVE:
687		*nodes = p->v.nodes;
688		break;
689	case MPOL_PREFERRED:
690		if (!(p->flags & MPOL_F_LOCAL))
691			node_set(p->v.preferred_node, *nodes);
692		/* else return empty node mask for local allocation */
693		break;
694	default:
695		BUG();
696	}
697}
698
699static int lookup_node(struct mm_struct *mm, unsigned long addr)
700{
701	struct page *p;
702	int err;
703
704	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
705	if (err >= 0) {
706		err = page_to_nid(p);
707		put_page(p);
708	}
709	return err;
710}
711
712/* Retrieve NUMA policy */
713static long do_get_mempolicy(int *policy, nodemask_t *nmask,
714			     unsigned long addr, unsigned long flags)
715{
716	int err;
717	struct mm_struct *mm = current->mm;
718	struct vm_area_struct *vma = NULL;
719	struct mempolicy *pol = current->mempolicy;
720
721	if (flags &
722		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
723		return -EINVAL;
724
725	if (flags & MPOL_F_MEMS_ALLOWED) {
726		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
727			return -EINVAL;
728		*policy = 0;	/* just so it's initialized */
729		task_lock(current);
730		*nmask  = cpuset_current_mems_allowed;
731		task_unlock(current);
732		return 0;
733	}
734
735	if (flags & MPOL_F_ADDR) {
736		/*
737		 * Do NOT fall back to task policy if the
738		 * vma/shared policy at addr is NULL.  We
739		 * want to return MPOL_DEFAULT in this case.
740		 */
741		down_read(&mm->mmap_sem);
742		vma = find_vma_intersection(mm, addr, addr+1);
743		if (!vma) {
744			up_read(&mm->mmap_sem);
745			return -EFAULT;
746		}
747		if (vma->vm_ops && vma->vm_ops->get_policy)
748			pol = vma->vm_ops->get_policy(vma, addr);
749		else
750			pol = vma->vm_policy;
751	} else if (addr)
752		return -EINVAL;
753
754	if (!pol)
755		pol = &default_policy;	/* indicates default behavior */
756
757	if (flags & MPOL_F_NODE) {
758		if (flags & MPOL_F_ADDR) {
759			err = lookup_node(mm, addr);
760			if (err < 0)
761				goto out;
762			*policy = err;
763		} else if (pol == current->mempolicy &&
764				pol->mode == MPOL_INTERLEAVE) {
765			*policy = current->il_next;
766		} else {
767			err = -EINVAL;
768			goto out;
769		}
770	} else {
771		*policy = pol == &default_policy ? MPOL_DEFAULT :
772						pol->mode;
773		/*
774		 * Internal mempolicy flags must be masked off before exposing
775		 * the policy to userspace.
776		 */
777		*policy |= (pol->flags & MPOL_MODE_FLAGS);
778	}
779
780	if (vma) {
781		up_read(&current->mm->mmap_sem);
782		vma = NULL;
783	}
784
785	err = 0;
786	if (nmask) {
787		task_lock(current);
788		get_policy_nodemask(pol, nmask);
789		task_unlock(current);
790	}
791
792 out:
793	mpol_cond_put(pol);
794	if (vma)
795		up_read(&current->mm->mmap_sem);
796	return err;
797}
798
799#ifdef CONFIG_MIGRATION
800/*
801 * page migration
802 */
803static void migrate_page_add(struct page *page, struct list_head *pagelist,
804				unsigned long flags)
805{
806	/*
807	 * Avoid migrating a page that is shared with others.
808	 */
809	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
810		if (!isolate_lru_page(page)) {
811			list_add_tail(&page->lru, pagelist);
812		}
813	}
814}
815
816static struct page *new_node_page(struct page *page, unsigned long node, int **x)
817{
818	return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
819}
820
821/*
822 * Migrate pages from one node to a target node.
823 * Returns error or the number of pages not migrated.
824 */
825static int migrate_to_node(struct mm_struct *mm, int source, int dest,
826			   int flags)
827{
828	nodemask_t nmask;
829	LIST_HEAD(pagelist);
830	int err = 0;
831
832	nodes_clear(nmask);
833	node_set(source, nmask);
834
835	check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
836			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
837
838	if (!list_empty(&pagelist))
839		err = migrate_pages(&pagelist, new_node_page, dest);
840
841	return err;
842}
843
844/*
845 * Move pages between the two nodesets so as to preserve the physical
846 * layout as much as possible.
847 *
848 * Returns the number of pages that could not be moved.
849 */
850int do_migrate_pages(struct mm_struct *mm,
851	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
852{
853	int busy = 0;
854	int err;
855	nodemask_t tmp;
856
857	err = migrate_prep();
858	if (err)
859		return err;
860
861	down_read(&mm->mmap_sem);
862
863	err = migrate_vmas(mm, from_nodes, to_nodes, flags);
864	if (err)
865		goto out;
866
867/*
868 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
869 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
870 * bit in 'tmp', and return that <source, dest> pair for migration.
871 * The pair of nodemasks 'to' and 'from' define the map.
872 *
873 * If no pair of bits is found that way, fallback to picking some
874 * pair of 'source' and 'dest' bits that are not the same.  If the
875 * 'source' and 'dest' bits are the same, this represents a node
876 * that will be migrating to itself, so no pages need move.
877 *
878 * If no bits are left in 'tmp', or if all remaining bits left
879 * in 'tmp' correspond to the same bit in 'to', return false
880 * (nothing left to migrate).
881 *
882 * This lets us pick a pair of nodes to migrate between, such that
883 * if possible the dest node is not already occupied by some other
884 * source node, minimizing the risk of overloading the memory on a
885 * node, which would happen if we migrated incoming memory to a node
886 * before migrating the outgoing memory sourced from that same node.
887 *
888 * A single scan of tmp is sufficient.  As we go, we remember the
889 * most recent <s, d> pair that moved (s != d).  If we find a pair
890 * that not only moved, but what's better, moved to an empty slot
891 * (d is not set in tmp), then we break out then, with that pair.
892 * Otherwise when we finish scanning tmp, we at least have the
893 * most recent <s, d> pair that moved.  If we get all the way through
894 * the scan of tmp without finding any node that moved, much less
895 * moved to an empty node, then there is nothing left worth migrating.
896 */
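/*
 * Worked example (illustrative): with from_nodes = {0,1} and
 * to_nodes = {1,2}, node_remap() gives the map 0 -> 1, 1 -> 2.  The first
 * scan of tmp = {0,1} remembers <0,1> but keeps going because dest 1 is
 * still a pending source; it then finds <1,2>, whose dest is not in tmp,
 * and migrates node 1 to node 2 first.  The second pass migrates node 0
 * to the now-vacated node 1.
 */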
897
898	tmp = *from_nodes;
899	while (!nodes_empty(tmp)) {
900		int s, d;
901		int source = -1;
902		int dest = 0;
903
904		for_each_node_mask(s, tmp) {
905			d = node_remap(s, *from_nodes, *to_nodes);
906			if (s == d)
907				continue;
908
909			source = s;	/* Node moved. Memorize */
910			dest = d;
911
912			/* dest not in remaining from nodes? */
913			if (!node_isset(dest, tmp))
914				break;
915		}
916		if (source == -1)
917			break;
918
919		node_clear(source, tmp);
920		err = migrate_to_node(mm, source, dest, flags);
921		if (err > 0)
922			busy += err;
923		if (err < 0)
924			break;
925	}
926out:
927	up_read(&mm->mmap_sem);
928	if (err < 0)
929		return err;
930	return busy;
931
932}
933
934/*
935 * Allocate a new page for page migration based on vma policy.
936 * Start by assuming the page is mapped by the vma pointed to by @private.
937 * Search forward from there if it is not.  N.B., this assumes that the
938 * list of pages handed to migrate_pages()--which is how we get here--
939 * is in virtual address order.
940 */
941static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
942{
943	struct vm_area_struct *vma = (struct vm_area_struct *)private;
944	unsigned long uninitialized_var(address);
945
946	while (vma) {
947		address = page_address_in_vma(page, vma);
948		if (address != -EFAULT)
949			break;
950		vma = vma->vm_next;
951	}
952
953	/*
954	 * if !vma, alloc_page_vma() will use task or system default policy
955	 */
956	return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
957}
958#else
959
960static void migrate_page_add(struct page *page, struct list_head *pagelist,
961				unsigned long flags)
962{
963}
964
965int do_migrate_pages(struct mm_struct *mm,
966	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
967{
968	return -ENOSYS;
969}
970
971static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
972{
973	return NULL;
974}
975#endif
976
977static long do_mbind(unsigned long start, unsigned long len,
978		     unsigned short mode, unsigned short mode_flags,
979		     nodemask_t *nmask, unsigned long flags)
980{
981	struct vm_area_struct *vma;
982	struct mm_struct *mm = current->mm;
983	struct mempolicy *new;
984	unsigned long end;
985	int err;
986	LIST_HEAD(pagelist);
987
988	if (flags & ~(unsigned long)(MPOL_MF_STRICT |
989				     MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
990		return -EINVAL;
991	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
992		return -EPERM;
993
994	if (start & ~PAGE_MASK)
995		return -EINVAL;
996
997	if (mode == MPOL_DEFAULT)
998		flags &= ~MPOL_MF_STRICT;
999
1000	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1001	end = start + len;
1002
1003	if (end < start)
1004		return -EINVAL;
1005	if (end == start)
1006		return 0;
1007
1008	new = mpol_new(mode, mode_flags, nmask);
1009	if (IS_ERR(new))
1010		return PTR_ERR(new);
1011
1012	/*
1013	 * If we are using the default policy then operations
1014	 * on discontinuous address spaces are okay after all.
1015	 */
1016	if (!new)
1017		flags |= MPOL_MF_DISCONTIG_OK;
1018
1019	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1020		 start, start + len, mode, mode_flags,
1021		 nmask ? nodes_addr(*nmask)[0] : -1);
1022
1023	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1024
1025		err = migrate_prep();
1026		if (err)
1027			goto mpol_out;
1028	}
1029	{
1030		NODEMASK_SCRATCH(scratch);
1031		if (scratch) {
1032			down_write(&mm->mmap_sem);
1033			task_lock(current);
1034			err = mpol_set_nodemask(new, nmask, scratch);
1035			task_unlock(current);
1036			if (err)
1037				up_write(&mm->mmap_sem);
1038		} else
1039			err = -ENOMEM;
1040		NODEMASK_SCRATCH_FREE(scratch);
1041	}
1042	if (err)
1043		goto mpol_out;
1044
1045	vma = check_range(mm, start, end, nmask,
1046			  flags | MPOL_MF_INVERT, &pagelist);
1047
1048	err = PTR_ERR(vma);
1049	if (!IS_ERR(vma)) {
1050		int nr_failed = 0;
1051
1052		err = mbind_range(vma, start, end, new);
1053
1054		if (!list_empty(&pagelist))
1055			nr_failed = migrate_pages(&pagelist, new_vma_page,
1056						(unsigned long)vma);
1057
1058		if (!err && nr_failed && (flags & MPOL_MF_STRICT))
1059			err = -EIO;
1060	} else
1061		putback_lru_pages(&pagelist);
1062
1063	up_write(&mm->mmap_sem);
1064 mpol_out:
1065	mpol_put(new);
1066	return err;
1067}
1068
1069/*
1070 * User space interface with variable sized bitmaps for nodelists.
1071 */
1072
1073/* Copy a node mask from user space. */
1074static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1075		     unsigned long maxnode)
1076{
1077	unsigned long k;
1078	unsigned long nlongs;
1079	unsigned long endmask;
1080
1081	--maxnode;
1082	nodes_clear(*nodes);
1083	if (maxnode == 0 || !nmask)
1084		return 0;
1085	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1086		return -EINVAL;
1087
1088	nlongs = BITS_TO_LONGS(maxnode);
1089	if ((maxnode % BITS_PER_LONG) == 0)
1090		endmask = ~0UL;
1091	else
1092		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1093
1094	/* When the user specified more nodes than supported just check
1095	   if the unsupported part is all zero. */
1096	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1097		if (nlongs > PAGE_SIZE/sizeof(long))
1098			return -EINVAL;
1099		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1100			unsigned long t;
1101			if (get_user(t, nmask + k))
1102				return -EFAULT;
1103			if (k == nlongs - 1) {
1104				if (t & endmask)
1105					return -EINVAL;
1106			} else if (t)
1107				return -EINVAL;
1108		}
1109		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1110		endmask = ~0UL;
1111	}
1112
1113	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1114		return -EFAULT;
1115	nodes_addr(*nodes)[nlongs-1] &= endmask;
1116	return 0;
1117}
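/*
 * Illustrative example of the bounds handling above, assuming a 64-bit
 * kernel: a caller passing maxnode = 72 describes bits 0..70 after the
 * initial decrement, so nlongs = 2 and endmask = 0x7f (only the low 7 bits
 * of the second long are meaningful).  If MAX_NUMNODES is 64 or less, the
 * second long lies entirely in the unsupported region and the loop above
 * only verifies that its meaningful bits are zero.
 */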
1118
1119/* Copy a kernel node mask to user space */
1120static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1121			      nodemask_t *nodes)
1122{
1123	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1124	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1125
1126	if (copy > nbytes) {
1127		if (copy > PAGE_SIZE)
1128			return -EINVAL;
1129		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1130			return -EFAULT;
1131		copy = nbytes;
1132	}
1133	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1134}
1135
1136SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1137		unsigned long, mode, unsigned long __user *, nmask,
1138		unsigned long, maxnode, unsigned, flags)
1139{
1140	nodemask_t nodes;
1141	int err;
1142	unsigned short mode_flags;
1143
1144	mode_flags = mode & MPOL_MODE_FLAGS;
1145	mode &= ~MPOL_MODE_FLAGS;
1146	if (mode >= MPOL_MAX)
1147		return -EINVAL;
1148	if ((mode_flags & MPOL_F_STATIC_NODES) &&
1149	    (mode_flags & MPOL_F_RELATIVE_NODES))
1150		return -EINVAL;
1151	err = get_nodes(&nodes, nmask, maxnode);
1152	if (err)
1153		return err;
1154	return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1155}
1156
1157/* Set the process memory policy */
1158SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask,
1159		unsigned long, maxnode)
1160{
1161	int err;
1162	nodemask_t nodes;
1163	unsigned short flags;
1164
1165	flags = mode & MPOL_MODE_FLAGS;
1166	mode &= ~MPOL_MODE_FLAGS;
1167	if ((unsigned int)mode >= MPOL_MAX)
1168		return -EINVAL;
1169	if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1170		return -EINVAL;
1171	err = get_nodes(&nodes, nmask, maxnode);
1172	if (err)
1173		return err;
1174	return do_set_mempolicy(mode, flags, &nodes);
1175}
1176
1177SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1178		const unsigned long __user *, old_nodes,
1179		const unsigned long __user *, new_nodes)
1180{
1181	const struct cred *cred = current_cred(), *tcred;
1182	struct mm_struct *mm;
1183	struct task_struct *task;
1184	nodemask_t old;
1185	nodemask_t new;
1186	nodemask_t task_nodes;
1187	int err;
1188
1189	err = get_nodes(&old, old_nodes, maxnode);
1190	if (err)
1191		return err;
1192
1193	err = get_nodes(&new, new_nodes, maxnode);
1194	if (err)
1195		return err;
1196
1197	/* Find the mm_struct */
1198	read_lock(&tasklist_lock);
1199	task = pid ? find_task_by_vpid(pid) : current;
1200	if (!task) {
1201		read_unlock(&tasklist_lock);
1202		return -ESRCH;
1203	}
1204	mm = get_task_mm(task);
1205	read_unlock(&tasklist_lock);
1206
1207	if (!mm)
1208		return -EINVAL;
1209
1210	/*
1211	 * Check if this process has the right to modify the specified
1212	 * process. The right exists if the process has administrative
1213	 * capabilities, superuser privileges or the same
1214	 * userid as the target process.
1215	 */
1216	rcu_read_lock();
1217	tcred = __task_cred(task);
1218	if (cred->euid != tcred->suid && cred->euid != tcred->uid &&
1219	    cred->uid  != tcred->suid && cred->uid  != tcred->uid &&
1220	    !capable(CAP_SYS_NICE)) {
1221		rcu_read_unlock();
1222		err = -EPERM;
1223		goto out;
1224	}
1225	rcu_read_unlock();
1226
1227	task_nodes = cpuset_mems_allowed(task);
1228	/* Is the user allowed to access the target nodes? */
1229	if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
1230		err = -EPERM;
1231		goto out;
1232	}
1233
1234	if (!nodes_subset(new, node_states[N_HIGH_MEMORY])) {
1235		err = -EINVAL;
1236		goto out;
1237	}
1238
1239	err = security_task_movememory(task);
1240	if (err)
1241		goto out;
1242
1243	err = do_migrate_pages(mm, &old, &new,
1244		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1245out:
1246	mmput(mm);
1247	return err;
1248}
1249
1250
1251/* Retrieve NUMA policy */
1252SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1253		unsigned long __user *, nmask, unsigned long, maxnode,
1254		unsigned long, addr, unsigned long, flags)
1255{
1256	int err;
1257	int uninitialized_var(pval);
1258	nodemask_t nodes;
1259
1260	if (nmask != NULL && maxnode < MAX_NUMNODES)
1261		return -EINVAL;
1262
1263	err = do_get_mempolicy(&pval, &nodes, addr, flags);
1264
1265	if (err)
1266		return err;
1267
1268	if (policy && put_user(pval, policy))
1269		return -EFAULT;
1270
1271	if (nmask)
1272		err = copy_nodes_to_user(nmask, maxnode, &nodes);
1273
1274	return err;
1275}
1276
1277#ifdef CONFIG_COMPAT
1278
1279asmlinkage long compat_sys_get_mempolicy(int __user *policy,
1280				     compat_ulong_t __user *nmask,
1281				     compat_ulong_t maxnode,
1282				     compat_ulong_t addr, compat_ulong_t flags)
1283{
1284	long err;
1285	unsigned long __user *nm = NULL;
1286	unsigned long nr_bits, alloc_size;
1287	DECLARE_BITMAP(bm, MAX_NUMNODES);
1288
1289	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1290	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1291
1292	if (nmask)
1293		nm = compat_alloc_user_space(alloc_size);
1294
1295	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1296
1297	if (!err && nmask) {
1298		err = copy_from_user(bm, nm, alloc_size);
1299		/* ensure entire bitmap is zeroed */
1300		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1301		err |= compat_put_bitmap(nmask, bm, nr_bits);
1302	}
1303
1304	return err;
1305}
1306
1307asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1308				     compat_ulong_t maxnode)
1309{
1310	long err = 0;
1311	unsigned long __user *nm = NULL;
1312	unsigned long nr_bits, alloc_size;
1313	DECLARE_BITMAP(bm, MAX_NUMNODES);
1314
1315	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1316	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1317
1318	if (nmask) {
1319		err = compat_get_bitmap(bm, nmask, nr_bits);
1320		nm = compat_alloc_user_space(alloc_size);
1321		err |= copy_to_user(nm, bm, alloc_size);
1322	}
1323
1324	if (err)
1325		return -EFAULT;
1326
1327	return sys_set_mempolicy(mode, nm, nr_bits+1);
1328}
1329
1330asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1331			     compat_ulong_t mode, compat_ulong_t __user *nmask,
1332			     compat_ulong_t maxnode, compat_ulong_t flags)
1333{
1334	long err = 0;
1335	unsigned long __user *nm = NULL;
1336	unsigned long nr_bits, alloc_size;
1337	nodemask_t bm;
1338
1339	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1340	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1341
1342	if (nmask) {
1343		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1344		nm = compat_alloc_user_space(alloc_size);
1345		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1346	}
1347
1348	if (err)
1349		return -EFAULT;
1350
1351	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1352}
1353
1354#endif
1355
1356/*
1357 * get_vma_policy(@task, @vma, @addr)
1358 * @task - task for fallback if vma policy == default
1359 * @vma   - virtual memory area whose policy is sought
1360 * @addr  - address in @vma for shared policy lookup
1361 *
1362 * Returns the effective policy for a VMA at the specified address.
1363 * Falls back to @task or system default policy, as necessary.
1364 * Current or other task's task mempolicy and non-shared vma policies
1365 * are protected by the task's mmap_sem, which must be held for read by
1366 * the caller.
1367 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1368 * count--added by the get_policy() vm_op, as appropriate--to protect against
1369 * freeing by another task.  It is the caller's responsibility to free the
1370 * extra reference for shared policies.
1371 */
1372static struct mempolicy *get_vma_policy(struct task_struct *task,
1373		struct vm_area_struct *vma, unsigned long addr)
1374{
1375	struct mempolicy *pol = task->mempolicy;
1376
1377	if (vma) {
1378		if (vma->vm_ops && vma->vm_ops->get_policy) {
1379			struct mempolicy *vpol = vma->vm_ops->get_policy(vma,
1380									addr);
1381			if (vpol)
1382				pol = vpol;
1383		} else if (vma->vm_policy)
1384			pol = vma->vm_policy;
1385	}
1386	if (!pol)
1387		pol = &default_policy;
1388	return pol;
1389}
1390
1391/*
1392 * Return a nodemask representing a mempolicy for filtering nodes for
1393 * page allocation
1394 */
1395static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1396{
1397	/* Lower zones don't get a nodemask applied for MPOL_BIND */
1398	if (unlikely(policy->mode == MPOL_BIND) &&
1399			gfp_zone(gfp) >= policy_zone &&
1400			cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1401		return &policy->v.nodes;
1402
1403	return NULL;
1404}
1405
1406/* Return a zonelist indicated by gfp for node representing a mempolicy */
1407static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy)
1408{
1409	int nd = numa_node_id();
1410
1411	switch (policy->mode) {
1412	case MPOL_PREFERRED:
1413		if (!(policy->flags & MPOL_F_LOCAL))
1414			nd = policy->v.preferred_node;
1415		break;
1416	case MPOL_BIND:
1417		/*
1418		 * Normally, MPOL_BIND allocations are node-local within the
1419		 * allowed nodemask.  However, if __GFP_THISNODE is set and the
1420		 * current node is part of the mask, we use the zonelist for
1421		 * the first node in the mask instead.
1422		 */
1423		if (unlikely(gfp & __GFP_THISNODE) &&
1424				unlikely(!node_isset(nd, policy->v.nodes)))
1425			nd = first_node(policy->v.nodes);
1426		break;
1427	case MPOL_INTERLEAVE: /* should not happen */
1428		break;
1429	default:
1430		BUG();
1431	}
1432	return node_zonelist(nd, gfp);
1433}
1434
1435/* Do dynamic interleaving for a process */
1436static unsigned interleave_nodes(struct mempolicy *policy)
1437{
1438	unsigned nid, next;
1439	struct task_struct *me = current;
1440
1441	nid = me->il_next;
1442	next = next_node(nid, policy->v.nodes);
1443	if (next >= MAX_NUMNODES)
1444		next = first_node(policy->v.nodes);
1445	if (next < MAX_NUMNODES)
1446		me->il_next = next;
1447	return nid;
1448}
1449
1450/*
1451 * Depending on the memory policy provide a node from which to allocate the
1452 * next slab entry.
1453 * @policy must be protected against freeing by the caller.  If @policy is
1454 * the current task's mempolicy, this protection is implicit, as only the
1455 * task can change its policy.  The system default policy requires no
1456 * such protection.
1457 */
1458unsigned slab_node(struct mempolicy *policy)
1459{
1460	if (!policy || policy->flags & MPOL_F_LOCAL)
1461		return numa_node_id();
1462
1463	switch (policy->mode) {
1464	case MPOL_PREFERRED:
1465		/*
1466		 * handled MPOL_F_LOCAL above
1467		 */
1468		return policy->v.preferred_node;
1469
1470	case MPOL_INTERLEAVE:
1471		return interleave_nodes(policy);
1472
1473	case MPOL_BIND: {
1474		/*
1475		 * Follow bind policy behavior and start allocation at the
1476		 * first node.
1477		 */
1478		struct zonelist *zonelist;
1479		struct zone *zone;
1480		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1481		zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
1482		(void)first_zones_zonelist(zonelist, highest_zoneidx,
1483							&policy->v.nodes,
1484							&zone);
1485		return zone->node;
1486	}
1487
1488	default:
1489		BUG();
1490	}
1491}
1492
1493/* Do static interleaving for a VMA with known offset. */
1494static unsigned offset_il_node(struct mempolicy *pol,
1495		struct vm_area_struct *vma, unsigned long off)
1496{
1497	unsigned nnodes = nodes_weight(pol->v.nodes);
1498	unsigned target;
1499	int c;
1500	int nid = -1;
1501
1502	if (!nnodes)
1503		return numa_node_id();
1504	target = (unsigned int)off % nnodes;
1505	c = 0;
1506	do {
1507		nid = next_node(nid, pol->v.nodes);
1508		c++;
1509	} while (c <= target);
1510	return nid;
1511}
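/*
 * Illustrative example: with pol->v.nodes = {0,2,5} (three nodes) an
 * offset of 7 gives target = 7 % 3 = 1, and the loop above stops on the
 * second set bit of the mask, so the page is placed on node 2.
 */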
1512
1513/* Determine a node number for interleave */
1514static inline unsigned interleave_nid(struct mempolicy *pol,
1515		 struct vm_area_struct *vma, unsigned long addr, int shift)
1516{
1517	if (vma) {
1518		unsigned long off;
1519
1520		/*
1521		 * for small pages, there is no difference between
1522		 * shift and PAGE_SHIFT, so the bit-shift is safe.
1523		 * for huge pages, since vm_pgoff is in units of small
1524		 * pages, we need to shift off the always 0 bits to get
1525		 * a useful offset.
1526		 */
1527		BUG_ON(shift < PAGE_SHIFT);
1528		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1529		off += (addr - vma->vm_start) >> shift;
1530		return offset_il_node(pol, vma, off);
1531	} else
1532		return interleave_nodes(pol);
1533}
1534
1535#ifdef CONFIG_HUGETLBFS
1536/*
1537 * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1538 * @vma = virtual memory area whose policy is sought
1539 * @addr = address in @vma for shared policy lookup and interleave policy
1540 * @gfp_flags = for requested zone
1541 * @mpol = pointer to mempolicy pointer for reference counted mempolicy
1542 * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
1543 *
1544 * Returns a zonelist suitable for a huge page allocation and a pointer
1545 * to the struct mempolicy for conditional unref after allocation.
1546 * If the effective policy is 'BIND', returns a pointer to the mempolicy's
1547 * @nodemask for filtering the zonelist.
1548 */
1549struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1550				gfp_t gfp_flags, struct mempolicy **mpol,
1551				nodemask_t **nodemask)
1552{
1553	struct zonelist *zl;
1554
1555	*mpol = get_vma_policy(current, vma, addr);
1556	*nodemask = NULL;	/* assume !MPOL_BIND */
1557
1558	if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1559		zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1560				huge_page_shift(hstate_vma(vma))), gfp_flags);
1561	} else {
1562		zl = policy_zonelist(gfp_flags, *mpol);
1563		if ((*mpol)->mode == MPOL_BIND)
1564			*nodemask = &(*mpol)->v.nodes;
1565	}
1566	return zl;
1567}
1568#endif
1569
1570/* Allocate a page with the interleave policy.
1571   This has its own path because it needs to do special accounting. */
1572static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1573					unsigned nid)
1574{
1575	struct zonelist *zl;
1576	struct page *page;
1577
1578	zl = node_zonelist(nid, gfp);
1579	page = __alloc_pages(gfp, order, zl);
1580	if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1581		inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1582	return page;
1583}
1584
1585/**
1586 * 	alloc_page_vma	- Allocate a page for a VMA.
1587 *
1588 * 	@gfp:
1589 *      %GFP_USER    user allocation.
1590 *      %GFP_KERNEL  kernel allocations,
1591 *      %GFP_HIGHMEM highmem/user allocations,
1592 *      %GFP_FS      allocation should not call back into a file system.
1593 *      %GFP_ATOMIC  don't sleep.
1594 *
1595 * 	@vma:  Pointer to VMA or NULL if not available.
1596 *	@addr: Virtual Address of the allocation. Must be inside the VMA.
1597 *
1598 * 	This function allocates a page from the kernel page pool and applies
1599 *	a NUMA policy associated with the VMA or the current process.
1600 *	When VMA is not NULL caller must hold down_read on the mmap_sem of the
1601 *	mm_struct of the VMA to prevent it from going away. Should be used for
1602 *	all allocations for pages that will be mapped into
1603 * 	user space. Returns NULL when no page can be allocated.
1604 *
1605 *	Should be called with the mmap_sem of the vma held.
1606 */
1607struct page *
1608alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1609{
1610	struct mempolicy *pol = get_vma_policy(current, vma, addr);
1611	struct zonelist *zl;
1612
1613	if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
1614		unsigned nid;
1615
1616		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1617		mpol_cond_put(pol);
1618		return alloc_page_interleave(gfp, 0, nid);
1619	}
1620	zl = policy_zonelist(gfp, pol);
1621	if (unlikely(mpol_needs_cond_ref(pol))) {
1622		/*
1623		 * slow path: ref counted shared policy
1624		 */
1625		struct page *page =  __alloc_pages_nodemask(gfp, 0,
1626						zl, policy_nodemask(gfp, pol));
1627		__mpol_put(pol);
1628		return page;
1629	}
1630	/*
1631	 * fast path:  default or task policy
1632	 */
1633	return __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol));
1634}
1635
1636/**
1637 * 	alloc_pages_current - Allocate pages.
1638 *
1639 *	@gfp:
1640 *		%GFP_USER   user allocation,
1641 *      	%GFP_KERNEL kernel allocation,
1642 *      	%GFP_HIGHMEM highmem allocation,
1643 *      	%GFP_FS     don't call back into a file system.
1644 *      	%GFP_ATOMIC don't sleep.
1645 *	@order: Power of two of allocation size in pages. 0 is a single page.
1646 *
1647 *	Allocate a page from the kernel page pool and, when not in
1648 *	interrupt context, apply the current process' NUMA policy.
1649 *	Returns NULL when no page can be allocated.
1650 *
1651 *	Don't call cpuset_update_task_memory_state() unless
1652 *	1) it's ok to take cpuset_sem (can WAIT), and
1653 *	2) allocating for current task (not interrupt).
1654 */
1655struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1656{
1657	struct mempolicy *pol = current->mempolicy;
1658
1659	if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1660		pol = &default_policy;
1661
1662	/*
1663	 * No reference counting needed for current->mempolicy
1664	 * nor system default_policy
1665	 */
1666	if (pol->mode == MPOL_INTERLEAVE)
1667		return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1668	return __alloc_pages_nodemask(gfp, order,
1669			policy_zonelist(gfp, pol), policy_nodemask(gfp, pol));
1670}
1671EXPORT_SYMBOL(alloc_pages_current);
1672
1673/*
1674 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
1675 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
1676 * with the mems_allowed returned by cpuset_mems_allowed().  This
1677 * keeps mempolicies cpuset relative after its cpuset moves.  See
1678 * further kernel/cpuset.c update_nodemask().
1679 */
1680
1681/* Slow path of a mempolicy duplicate */
1682struct mempolicy *__mpol_dup(struct mempolicy *old)
1683{
1684	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1685
1686	if (!new)
1687		return ERR_PTR(-ENOMEM);
1688	if (current_cpuset_is_being_rebound()) {
1689		nodemask_t mems = cpuset_mems_allowed(current);
1690		mpol_rebind_policy(old, &mems);
1691	}
1692	*new = *old;
1693	atomic_set(&new->refcnt, 1);
1694	return new;
1695}
1696
1697/*
1698 * If *frompol needs [has] an extra ref, copy *frompol to *tompol,
1699 * eliminate the MPOL_F_* flags that require conditional ref and
1700 * [NOTE!!!] drop the extra ref.  Not safe to reference *frompol directly
1701 * after return.  Use the returned value.
1702 *
1703 * Allows use of a mempolicy for, e.g., multiple allocations with a single
1704 * policy lookup, even if the policy needs/has extra ref on lookup.
1705 * shmem_readahead needs this.
1706 */
1707struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol,
1708						struct mempolicy *frompol)
1709{
1710	if (!mpol_needs_cond_ref(frompol))
1711		return frompol;
1712
1713	*tompol = *frompol;
1714	tompol->flags &= ~MPOL_F_SHARED;	/* copy doesn't need unref */
1715	__mpol_put(frompol);
1716	return tompol;
1717}
1718
1719static int mpol_match_intent(const struct mempolicy *a,
1720			     const struct mempolicy *b)
1721{
1722	if (a->flags != b->flags)
1723		return 0;
1724	if (!mpol_store_user_nodemask(a))
1725		return 1;
1726	return nodes_equal(a->w.user_nodemask, b->w.user_nodemask);
1727}
1728
1729/* Slow path of a mempolicy comparison */
1730int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1731{
1732	if (!a || !b)
1733		return 0;
1734	if (a->mode != b->mode)
1735		return 0;
1736	if (a->mode != MPOL_DEFAULT && !mpol_match_intent(a, b))
1737		return 0;
1738	switch (a->mode) {
1739	case MPOL_BIND:
1740		/* Fall through */
1741	case MPOL_INTERLEAVE:
1742		return nodes_equal(a->v.nodes, b->v.nodes);
1743	case MPOL_PREFERRED:
1744		return a->v.preferred_node == b->v.preferred_node &&
1745			a->flags == b->flags;
1746	default:
1747		BUG();
1748		return 0;
1749	}
1750}
1751
1752/*
1753 * Shared memory backing store policy support.
1754 *
1755 * Remember policies even when nobody has shared memory mapped.
1756 * The policies are kept in Red-Black tree linked from the inode.
1757 * They are protected by the sp->lock spinlock, which should be held
1758 * for any accesses to the tree.
1759 */
1760
1761/* lookup first element intersecting start-end */
1762/* Caller holds sp->lock */
1763static struct sp_node *
1764sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1765{
1766	struct rb_node *n = sp->root.rb_node;
1767
1768	while (n) {
1769		struct sp_node *p = rb_entry(n, struct sp_node, nd);
1770
1771		if (start >= p->end)
1772			n = n->rb_right;
1773		else if (end <= p->start)
1774			n = n->rb_left;
1775		else
1776			break;
1777	}
1778	if (!n)
1779		return NULL;
1780	for (;;) {
1781		struct sp_node *w = NULL;
1782		struct rb_node *prev = rb_prev(n);
1783		if (!prev)
1784			break;
1785		w = rb_entry(prev, struct sp_node, nd);
1786		if (w->end <= start)
1787			break;
1788		n = prev;
1789	}
1790	return rb_entry(n, struct sp_node, nd);
1791}
1792
1793/* Insert a new shared policy into the list. */
1794/* Caller holds sp->lock */
1795static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1796{
1797	struct rb_node **p = &sp->root.rb_node;
1798	struct rb_node *parent = NULL;
1799	struct sp_node *nd;
1800
1801	while (*p) {
1802		parent = *p;
1803		nd = rb_entry(parent, struct sp_node, nd);
1804		if (new->start < nd->start)
1805			p = &(*p)->rb_left;
1806		else if (new->end > nd->end)
1807			p = &(*p)->rb_right;
1808		else
1809			BUG();
1810	}
1811	rb_link_node(&new->nd, parent, p);
1812	rb_insert_color(&new->nd, &sp->root);
1813	pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
1814		 new->policy ? new->policy->mode : 0);
1815}
1816
1817/* Find shared policy intersecting idx */
1818struct mempolicy *
1819mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1820{
1821	struct mempolicy *pol = NULL;
1822	struct sp_node *sn;
1823
1824	if (!sp->root.rb_node)
1825		return NULL;
1826	spin_lock(&sp->lock);
1827	sn = sp_lookup(sp, idx, idx+1);
1828	if (sn) {
1829		mpol_get(sn->policy);
1830		pol = sn->policy;
1831	}
1832	spin_unlock(&sp->lock);
1833	return pol;
1834}
1835
1836static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1837{
1838	pr_debug("deleting %lx-%lx\n", n->start, n->end);
1839	rb_erase(&n->nd, &sp->root);
1840	mpol_put(n->policy);
1841	kmem_cache_free(sn_cache, n);
1842}
1843
1844static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
1845				struct mempolicy *pol)
1846{
1847	struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1848
1849	if (!n)
1850		return NULL;
1851	n->start = start;
1852	n->end = end;
1853	mpol_get(pol);
1854	pol->flags |= MPOL_F_SHARED;	/* for unref */
1855	n->policy = pol;
1856	return n;
1857}
1858
1859/* Replace a policy range. */
1860static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1861				 unsigned long end, struct sp_node *new)
1862{
1863	struct sp_node *n, *new2 = NULL;
1864
1865restart:
1866	spin_lock(&sp->lock);
1867	n = sp_lookup(sp, start, end);
1868	/* Take care of old policies in the same range. */
1869	while (n && n->start < end) {
1870		struct rb_node *next = rb_next(&n->nd);
1871		if (n->start >= start) {
1872			if (n->end <= end)
1873				sp_delete(sp, n);
1874			else
1875				n->start = end;
1876		} else {
1877			/* Old policy spanning whole new range. */
1878			if (n->end > end) {
1879				if (!new2) {
1880					spin_unlock(&sp->lock);
1881					new2 = sp_alloc(end, n->end, n->policy);
1882					if (!new2)
1883						return -ENOMEM;
1884					goto restart;
1885				}
1886				n->end = start;
1887				sp_insert(sp, new2);
1888				new2 = NULL;
1889				break;
1890			} else
1891				n->end = start;
1892		}
1893		if (!next)
1894			break;
1895		n = rb_entry(next, struct sp_node, nd);
1896	}
1897	if (new)
1898		sp_insert(sp, new);
1899	spin_unlock(&sp->lock);
1900	if (new2) {
1901		mpol_put(new2->policy);
1902		kmem_cache_free(sn_cache, new2);
1903	}
1904	return 0;
1905}
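/*
 * Illustrative example: if an existing node covers pgoff [0, 10) with
 * policy A and a new node for [3, 6) with policy B is installed, the old
 * node is trimmed to [0, 3), a copy of it (new2) is inserted for [6, 10),
 * and the new node is inserted for [3, 6); lookups then see A, B, A
 * across the range.
 */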
1906
1907/**
1908 * mpol_shared_policy_init - initialize shared policy for inode
1909 * @sp: pointer to inode shared policy
1910 * @mpol:  struct mempolicy to install
1911 *
1912 * Install non-NULL @mpol in inode's shared policy rb-tree.
1913 * On entry, the current task holds a reference on a non-NULL @mpol;
1914 * this routine drops that reference before returning.
1915 * This is called from get_inode(), so GFP_KERNEL allocations are safe.
1916 */
1917void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
1918{
1919	int ret;
1920
1921	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
1922	spin_lock_init(&sp->lock);
1923
1924	if (mpol) {
1925		struct vm_area_struct pvma;
1926		struct mempolicy *new;
1927		NODEMASK_SCRATCH(scratch);
1928
1929		if (!scratch) {
1930			mpol_put(mpol);	/* drop our ref on sb mpol */
			return;
		}
1931		/* contextualize the tmpfs mount point mempolicy */
1932		new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
1933		if (IS_ERR(new)) {
1934			mpol_put(mpol);	/* drop our ref on sb mpol */
1935			NODEMASK_SCRATCH_FREE(scratch);
1936			return;		/* no valid nodemask intersection */
1937		}
1938
1939		task_lock(current);
1940		ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
1941		task_unlock(current);
1942		mpol_put(mpol);	/* drop our ref on sb mpol */
1943		if (ret) {
1944			NODEMASK_SCRATCH_FREE(scratch);
1945			mpol_put(new);
1946			return;
1947		}
1948
1949		/* Create pseudo-vma that contains just the policy */
1950		memset(&pvma, 0, sizeof(struct vm_area_struct));
1951		pvma.vm_end = TASK_SIZE;	/* policy covers entire file */
1952		mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
1953		mpol_put(new);			/* drop initial ref */
1954		NODEMASK_SCRATCH_FREE(scratch);
1955	}
1956}
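
/*
 * Usage sketch (illustrative, not part of the original file): tmpfs
 * initializes the per-inode shared policy from the superblock mount
 * policy when creating an inode, roughly:
 *
 *	mpol_shared_policy_init(&info->policy, shmem_get_sbmpol(sbinfo));
 *
 * where shmem_get_sbmpol() hands over a reference on the mount's
 * mempolicy -- the reference this routine is documented to drop.
 */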
1957
1958int mpol_set_shared_policy(struct shared_policy *info,
1959			struct vm_area_struct *vma, struct mempolicy *npol)
1960{
1961	int err;
1962	struct sp_node *new = NULL;
1963	unsigned long sz = vma_pages(vma);
1964
1965	pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
1966		 vma->vm_pgoff,
1967		 sz, npol ? npol->mode : -1,
1968		 npol ? npol->flags : -1,
1969		 npol ? nodes_addr(npol->v.nodes)[0] : -1);
1970
1971	if (npol) {
1972		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1973		if (!new)
1974			return -ENOMEM;
1975	}
1976	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1977	if (err && new)
1978		kmem_cache_free(sn_cache, new);
1979	return err;
1980}
1981
1982/* Free a backing policy store on inode delete. */
1983void mpol_free_shared_policy(struct shared_policy *p)
1984{
1985	struct sp_node *n;
1986	struct rb_node *next;
1987
1988	if (!p->root.rb_node)
1989		return;
1990	spin_lock(&p->lock);
1991	next = rb_first(&p->root);
1992	while (next) {
1993		n = rb_entry(next, struct sp_node, nd);
1994		next = rb_next(&n->nd);
1995		rb_erase(&n->nd, &p->root);
1996		mpol_put(n->policy);
1997		kmem_cache_free(sn_cache, n);
1998	}
1999	spin_unlock(&p->lock);
2000}
2001
2002/* assumes fs == KERNEL_DS */
2003void __init numa_policy_init(void)
2004{
2005	nodemask_t interleave_nodes;
2006	unsigned long largest = 0;
2007	int nid, prefer = 0;
2008
2009	policy_cache = kmem_cache_create("numa_policy",
2010					 sizeof(struct mempolicy),
2011					 0, SLAB_PANIC, NULL);
2012
2013	sn_cache = kmem_cache_create("shared_policy_node",
2014				     sizeof(struct sp_node),
2015				     0, SLAB_PANIC, NULL);
2016
2017	/*
2018	 * Set interleaving policy for system init. Interleaving is only
2019	 * enabled across nodes with at least 16MB of present memory; if
2020	 * every node is smaller than that, fall back to the largest node.
2021	 */
2022	nodes_clear(interleave_nodes);
2023	for_each_node_state(nid, N_HIGH_MEMORY) {
2024		unsigned long total_pages = node_present_pages(nid);
2025
2026		/* Preserve the largest node */
2027		if (largest < total_pages) {
2028			largest = total_pages;
2029			prefer = nid;
2030		}
2031
2032		/* Interleave this node? */
2033		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2034			node_set(nid, interleave_nodes);
2035	}
2036
2037	/* All too small, use the largest */
2038	if (unlikely(nodes_empty(interleave_nodes)))
2039		node_set(prefer, interleave_nodes);
2040
2041	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2042		printk(KERN_ERR "numa_policy_init: interleaving failed\n");
2043}
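
/*
 * Illustrative arithmetic (not part of the original file): with 4KB pages
 * (PAGE_SHIFT == 12) the check above,
 *
 *	(total_pages << PAGE_SHIFT) >= (16 << 20)
 *
 * admits a node into the boot-time interleave set once it has at least
 * 16MB / 4KB == 4096 present pages.
 */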
2044
2045/* Reset policy of current process to default */
2046void numa_default_policy(void)
2047{
2048	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2049}
2050
2051/*
2052 * Parse and format mempolicy from/to strings
2053 */
2054
2055/*
2056 * "local" is a pseudo-policy:  MPOL_PREFERRED with the MPOL_F_LOCAL flag
2057 * Used only for mpol_parse_str() and mpol_to_str()
2058 */
2059#define MPOL_LOCAL (MPOL_INTERLEAVE + 1)
2060static const char * const policy_types[] =
2061	{ "default", "prefer", "bind", "interleave", "local" };
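
/*
 * Example strings handled by the parse/format helpers below (nodelists
 * are illustrative and must name nodes that actually have memory):
 *
 *	"default"			MPOL_DEFAULT
 *	"prefer:2"			MPOL_PREFERRED, preferred node 2
 *	"bind:0-3"			MPOL_BIND over nodes 0-3
 *	"interleave=static:0,2"		MPOL_INTERLEAVE, MPOL_F_STATIC_NODES
 *	"local"				MPOL_PREFERRED with MPOL_F_LOCAL
 */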
2062
2063
2064#ifdef CONFIG_TMPFS
2065/**
2066 * mpol_parse_str - parse string to mempolicy
2067 * @str:  string containing mempolicy to parse
2068 * @mpol:  pointer to struct mempolicy pointer, returned on success.
2069 * @no_context:  flag whether to "contextualize" the mempolicy
2070 *
2071 * Format of input:
2072 *	<mode>[=<flags>][:<nodelist>]
2073 *
2074 * if @no_context is true, save the input nodemask in w.user_nodemask in
2075 * the returned mempolicy.  This will be used to "clone" the mempolicy in
2076 * a specific context [cpuset] at a later time.  Used to parse tmpfs mpol
2077 * mount option.  Note that if 'static' or 'relative' mode flags were
2078 * specified, the input nodemask will already have been saved.  Saving
2079 * it again is redundant, but safe.
2080 *
2081 * On success, returns 0, else 1
2082 */
2083int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2084{
2085	struct mempolicy *new = NULL;
2086	unsigned short uninitialized_var(mode);
2087	unsigned short uninitialized_var(mode_flags);
2088	nodemask_t nodes;
2089	char *nodelist = strchr(str, ':');
2090	char *flags = strchr(str, '=');
2091	int i;
2092	int err = 1;
2093
2094	if (nodelist) {
2095		/* NUL-terminate mode or flags string */
2096		*nodelist++ = '\0';
2097		if (nodelist_parse(nodelist, nodes))
2098			goto out;
2099		if (!nodes_subset(nodes, node_states[N_HIGH_MEMORY]))
2100			goto out;
2101	} else
2102		nodes_clear(nodes);
2103
2104	if (flags)
2105		*flags++ = '\0';	/* terminate mode string */
2106
2107	for (i = 0; i <= MPOL_LOCAL; i++) {
2108		if (!strcmp(str, policy_types[i])) {
2109			mode = i;
2110			break;
2111		}
2112	}
2113	if (i > MPOL_LOCAL)
2114		goto out;
2115
2116	switch (mode) {
2117	case MPOL_PREFERRED:
2118		/*
2119		 * Insist on a nodelist of one node only
2120		 */
2121		if (nodelist) {
2122			char *rest = nodelist;
2123			while (isdigit(*rest))
2124				rest++;
2125			if (!*rest)
2126				err = 0;
2127		}
2128		break;
2129	case MPOL_INTERLEAVE:
2130		/*
2131		 * Default to online nodes with memory if no nodelist
2132		 */
2133		if (!nodelist)
2134			nodes = node_states[N_HIGH_MEMORY];
2135		err = 0;
2136		break;
2137	case MPOL_LOCAL:
2138		/*
2139		 * Don't allow a nodelist;  mpol_new() checks flags
2140		 */
2141		if (nodelist)
2142			goto out;
2143		mode = MPOL_PREFERRED;
		err = 0;
2144		break;
2145
	case MPOL_DEFAULT:
		/*
		 * Insist on an empty nodelist.  mpol_new() would return
		 * NULL here, so skip it: *mpol is set to NULL at the out
		 * label, which callers treat as the default policy.
		 */
		if (!nodelist)
			err = 0;
		goto out;
	case MPOL_BIND:
		/*
		 * Insist on a nodelist; mpol_new() additionally enforces
		 * a non-empty nodemask and validates the flags.
		 */
		if (!nodelist)
			goto out;
		err = 0;
		break;
2150	}
2151
2152	mode_flags = 0;
2153	if (flags) {
2154		/*
2155		 * Currently, we only support two mutually exclusive
2156		 * mode flags.
2157		 */
2158		if (!strcmp(flags, "static"))
2159			mode_flags |= MPOL_F_STATIC_NODES;
2160		else if (!strcmp(flags, "relative"))
2161			mode_flags |= MPOL_F_RELATIVE_NODES;
2162		else
2163			err = 1;
2164	}
2165
2166	new = mpol_new(mode, mode_flags, &nodes);
2167	if (IS_ERR(new))
2168		err = 1;
2169	else {
2170		int ret;
2171		NODEMASK_SCRATCH(scratch);
2172		if (scratch) {
2173			task_lock(current);
2174			ret = mpol_set_nodemask(new, &nodes, scratch);
2175			task_unlock(current);
2176		} else
2177			ret = -ENOMEM;
2178		NODEMASK_SCRATCH_FREE(scratch);
2179		if (ret) {
2180			err = 1;
2181			mpol_put(new);
2182		} else if (no_context) {
2183			/* save for contextualization */
2184			new->w.user_nodemask = nodes;
2185		}
2186	}
2187
2188out:
2189	/* Restore string for error message */
2190	if (nodelist)
2191		*--nodelist = ':';
2192	if (flags)
2193		*--flags = '=';
2194	if (!err)
2195		*mpol = new;
2196	return err;
2197}
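
/*
 * Usage sketch (illustrative, not from the original source): tmpfs parses
 * its "mpol=" mount option roughly as
 *
 *	struct mempolicy *mpol = NULL;
 *
 *	if (mpol_parse_str(value, &mpol, 1))
 *		return 1;
 *
 * With no_context == 1 the raw nodemask is preserved in w.user_nodemask so
 * the policy can be contextualized against a cpuset later (see
 * mpol_shared_policy_init()).  Note the option string is restored (':' and
 * '=' put back) even on failure, so the caller may log it verbatim.
 */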
2198#endif /* CONFIG_TMPFS */
2199
2200/**
2201 * mpol_to_str - format a mempolicy structure for printing
2202 * @buffer:  to contain formatted mempolicy string
2203 * @maxlen:  length of @buffer
2204 * @pol:  pointer to mempolicy to be formatted
2205 * @no_context:  "context free" mempolicy - use nodemask in w.user_nodemask
2206 *
2207 * Convert a mempolicy into a string.
2208 * Returns the number of characters in buffer (if positive)
2209 * or an error (negative)
2210 */
2211int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
2212{
2213	char *p = buffer;
2214	int l;
2215	nodemask_t nodes;
2216	unsigned short mode;
2217	unsigned short flags = pol ? pol->flags : 0;
2218
2219	/*
2220	 * Sanity check:  room for longest mode, flag and some nodes
2221	 */
2222	VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16);
2223
2224	if (!pol || pol == &default_policy)
2225		mode = MPOL_DEFAULT;
2226	else
2227		mode = pol->mode;
2228
2229	switch (mode) {
2230	case MPOL_DEFAULT:
2231		nodes_clear(nodes);
2232		break;
2233
2234	case MPOL_PREFERRED:
2235		nodes_clear(nodes);
2236		if (flags & MPOL_F_LOCAL)
2237			mode = MPOL_LOCAL;	/* pseudo-policy */
2238		else
2239			node_set(pol->v.preferred_node, nodes);
2240		break;
2241
2242	case MPOL_BIND:
2243		/* Fall through */
2244	case MPOL_INTERLEAVE:
2245		if (no_context)
2246			nodes = pol->w.user_nodemask;
2247		else
2248			nodes = pol->v.nodes;
2249		break;
2250
2251	default:
2252		BUG();
2253	}
2254
2255	l = strlen(policy_types[mode]);
2256	if (buffer + maxlen < p + l + 1)
2257		return -ENOSPC;
2258
2259	strcpy(p, policy_types[mode]);
2260	p += l;
2261
2262	if (flags & MPOL_MODE_FLAGS) {
2263		if (buffer + maxlen < p + 2)
2264			return -ENOSPC;
2265		*p++ = '=';
2266
2267		/*
2268		 * Currently, the only defined flags are mutually exclusive
2269		 */
2270		if (flags & MPOL_F_STATIC_NODES)
2271			p += snprintf(p, buffer + maxlen - p, "static");
2272		else if (flags & MPOL_F_RELATIVE_NODES)
2273			p += snprintf(p, buffer + maxlen - p, "relative");
2274	}
2275
2276	if (!nodes_empty(nodes)) {
2277		if (buffer + maxlen < p + 2)
2278			return -ENOSPC;
2279		*p++ = ':';
2280		p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
2281	}
2282	return p - buffer;
2283}
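
/*
 * Example output (illustrative): an interleave policy over nodes 0-3 with
 * MPOL_F_RELATIVE_NODES is formatted as
 *
 *	"interleave=relative:0-3"
 *
 * while a local-allocation preference (MPOL_PREFERRED + MPOL_F_LOCAL) is
 * formatted as plain "local" with neither flags nor a nodelist.
 */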
2284
2285struct numa_maps {
2286	unsigned long pages;
2287	unsigned long anon;
2288	unsigned long active;
2289	unsigned long writeback;
2290	unsigned long mapcount_max;
2291	unsigned long dirty;
2292	unsigned long swapcache;
2293	unsigned long node[MAX_NUMNODES];
2294};
2295
2296static void gather_stats(struct page *page, void *private, int pte_dirty)
2297{
2298	struct numa_maps *md = private;
2299	int count = page_mapcount(page);
2300
2301	md->pages++;
2302	if (pte_dirty || PageDirty(page))
2303		md->dirty++;
2304
2305	if (PageSwapCache(page))
2306		md->swapcache++;
2307
2308	if (PageActive(page) || PageUnevictable(page))
2309		md->active++;
2310
2311	if (PageWriteback(page))
2312		md->writeback++;
2313
2314	if (PageAnon(page))
2315		md->anon++;
2316
2317	if (count > md->mapcount_max)
2318		md->mapcount_max = count;
2319
2320	md->node[page_to_nid(page)]++;
2321}
2322
2323#ifdef CONFIG_HUGETLB_PAGE
2324static void check_huge_range(struct vm_area_struct *vma,
2325		unsigned long start, unsigned long end,
2326		struct numa_maps *md)
2327{
2328	unsigned long addr;
2329	struct page *page;
2330	struct hstate *h = hstate_vma(vma);
2331	unsigned long sz = huge_page_size(h);
2332
2333	for (addr = start; addr < end; addr += sz) {
2334		pte_t *ptep = huge_pte_offset(vma->vm_mm,
2335						addr & huge_page_mask(h));
2336		pte_t pte;
2337
2338		if (!ptep)
2339			continue;
2340
2341		pte = *ptep;
2342		if (pte_none(pte))
2343			continue;
2344
2345		page = pte_page(pte);
2346		if (!page)
2347			continue;
2348
2349		gather_stats(page, md, pte_dirty(*ptep));
2350	}
2351}
2352#else
2353static inline void check_huge_range(struct vm_area_struct *vma,
2354		unsigned long start, unsigned long end,
2355		struct numa_maps *md)
2356{
2357}
2358#endif
2359
2360/*
2361 * Display pages allocated per node and memory policy via /proc.
2362 */
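/*
 * Illustrative output (values invented, line wrapped here for width):
 *
 *	7f1e2a400000 interleave=static:0-3 file=/dev/shm/seg anon=16
 *		dirty=16 mapmax=2 N0=4 N1=4 N2=4 N3=4
 *
 * i.e. VMA start address, policy string, an optional file/heap/stack tag,
 * the counters gathered above, and a page count per node.
 */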
2363int show_numa_map(struct seq_file *m, void *v)
2364{
2365	struct proc_maps_private *priv = m->private;
2366	struct vm_area_struct *vma = v;
2367	struct numa_maps *md;
2368	struct file *file = vma->vm_file;
2369	struct mm_struct *mm = vma->vm_mm;
2370	struct mempolicy *pol;
2371	int n;
2372	char buffer[50];
2373
2374	if (!mm)
2375		return 0;
2376
2377	md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
2378	if (!md)
2379		return 0;
2380
2381	pol = get_vma_policy(priv->task, vma, vma->vm_start);
2382	mpol_to_str(buffer, sizeof(buffer), pol, 0);
2383	mpol_cond_put(pol);
2384
2385	seq_printf(m, "%08lx %s", vma->vm_start, buffer);
2386
2387	if (file) {
2388		seq_printf(m, " file=");
2389		seq_path(m, &file->f_path, "\n\t= ");
2390	} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
2391		seq_printf(m, " heap");
2392	} else if (vma->vm_start <= mm->start_stack &&
2393			vma->vm_end >= mm->start_stack) {
2394		seq_printf(m, " stack");
2395	}
2396
2397	if (is_vm_hugetlb_page(vma)) {
2398		check_huge_range(vma, vma->vm_start, vma->vm_end, md);
2399		seq_printf(m, " huge");
2400	} else {
2401		check_pgd_range(vma, vma->vm_start, vma->vm_end,
2402			&node_states[N_HIGH_MEMORY], MPOL_MF_STATS, md);
2403	}
2404
2405	if (!md->pages)
2406		goto out;
2407
2408	if (md->anon)
2409		seq_printf(m, " anon=%lu", md->anon);
2410
2411	if (md->dirty)
2412		seq_printf(m, " dirty=%lu", md->dirty);
2413
2414	if (md->pages != md->anon && md->pages != md->dirty)
2415		seq_printf(m, " mapped=%lu", md->pages);
2416
2417	if (md->mapcount_max > 1)
2418		seq_printf(m, " mapmax=%lu", md->mapcount_max);
2419
2420	if (md->swapcache)
2421		seq_printf(m, " swapcache=%lu", md->swapcache);
2422
2423	if (md->active < md->pages && !is_vm_hugetlb_page(vma))
2424		seq_printf(m, " active=%lu", md->active);
2425
2426	if (md->writeback)
2427		seq_printf(m, " writeback=%lu", md->writeback);
2428
2429	for_each_node_state(n, N_HIGH_MEMORY)
2430		if (md->node[n])
2431			seq_printf(m, " N%d=%lu", n, md->node[n]);
2432out:
2433	seq_putc(m, '\n');
2434	kfree(md);
2435
2436	if (m->count < m->size)
2437		m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
2438	return 0;
2439}
2440