mempolicy.c revision 0b14c179a483e71ea41df2aa4a661760063115bd
1/*
2 * Simple NUMA memory policy for the Linux kernel.
3 *
4 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
6 * Subject to the GNU Public License, version 2.
7 *
8 * NUMA policy allows the user to give hints about which node(s) memory
9 * should be allocated on.
10 *
11 * Support four policies per VMA and per process:
12 *
13 * The VMA policy has priority over the process policy for a page fault.
14 *
15 * interleave     Allocate memory interleaved over a set of nodes,
16 *                with normal fallback if it fails.
17 *                For VMA based allocations this interleaves based on the
18 *                offset into the backing object or offset into the mapping
19 *                for anonymous memory. For process policy a per-process
20 *                counter is used.
21 *
22 * bind           Only allocate memory on a specific set of nodes,
23 *                no fallback.
24 *                FIXME: memory is allocated starting with the first node
25 *                and proceeding to the last. It would be better if bind
26 *                truly restricted the allocation to the memory nodes instead.
27 *
28 * preferred      Try a specific node first before normal fallback.
29 *                As a special case node -1 here means do the allocation
30 *                on the local CPU. This is normally identical to default,
31 *                but useful to set in a VMA when you have a non-default
32 *                process policy.
33 *
34 * default        Allocate on the local node first, or when on a VMA
35 *                use the process policy. This is what Linux always did
36 *		  in a NUMA aware kernel and still does by, ahem, default.
37 *
38 * The process policy is applied for most non-interrupt memory allocations
39 * in that process' context. Interrupts ignore the policies and always
40 * try to allocate on the local CPU. The VMA policy is only applied for memory
41 * allocations for a VMA in the VM.
42 *
43 * Currently there are a few corner cases in swapping where the policy
44 * is not applied, but the majority should be handled. When process policy
45 * is used it is not remembered over swap outs/swap ins.
46 *
47 * Only the highest zone in the zone hierarchy gets policied. Allocations
48 * requesting a lower zone just use the default policy. This implies that
49 * on systems with highmem, kernel lowmem allocations don't get policied.
50 * Same with GFP_DMA allocations.
51 *
52 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
53 * all users and remembered even when nobody has memory mapped.
54 */
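/*
 * A minimal user-space sketch of the above (raw syscall wrappers as shipped
 * in libnuma's <numaif.h>; nothing in it is defined in this file, and it
 * assumes at least four online nodes allowed by the current cpuset):
 *
 *	#include <numaif.h>
 *
 *	unsigned long mask = 0xfUL;		// nodes 0-3
 *
 *	// interleave all further allocations of this process over nodes 0-3
 *	set_mempolicy(MPOL_INTERLEAVE, &mask, sizeof(mask) * 8);
 *
 *	// drop back to the default local-node policy
 *	set_mempolicy(MPOL_DEFAULT, NULL, 0);
 *
 * Per-VMA policies are installed with mbind() instead; see the user space
 * interface section further down.
 */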
55
56/* Notebook:
57   fix mmap readahead to honour policy and enable policy for any page cache
58   object
59   statistics for bigpages
60   global policy for page cache? currently it uses process policy. Requires
61   first item above.
62   handle mremap for shared memory (currently ignored for the policy)
63   grows down?
64   make bind policy root only? It can trigger oom much faster and the
65   kernel is not always graceful about that.
66   could replace all the switch()es with a mempolicy_ops structure.
67*/
68
69#include <linux/mempolicy.h>
70#include <linux/mm.h>
71#include <linux/highmem.h>
72#include <linux/hugetlb.h>
73#include <linux/kernel.h>
74#include <linux/sched.h>
75#include <linux/mm.h>
76#include <linux/nodemask.h>
77#include <linux/cpuset.h>
78#include <linux/gfp.h>
79#include <linux/slab.h>
80#include <linux/string.h>
81#include <linux/module.h>
82#include <linux/interrupt.h>
83#include <linux/init.h>
84#include <linux/compat.h>
85#include <linux/mempolicy.h>
86#include <asm/tlbflush.h>
87#include <asm/uaccess.h>
88
89static kmem_cache_t *policy_cache;
90static kmem_cache_t *sn_cache;
91
92#define PDprintk(fmt...)
93
94/* Highest zone. A specific allocation for a zone below that is not
95   policied. */
96static int policy_zone;
97
98struct mempolicy default_policy = {
99	.refcnt = ATOMIC_INIT(1), /* never free it */
100	.policy = MPOL_DEFAULT,
101};
102
103/* Do sanity checking on a policy */
104static int mpol_check_policy(int mode, nodemask_t *nodes)
105{
106	int empty = nodes_empty(*nodes);
107
108	switch (mode) {
109	case MPOL_DEFAULT:
110		if (!empty)
111			return -EINVAL;
112		break;
113	case MPOL_BIND:
114	case MPOL_INTERLEAVE:
115		/* Preferred will only use the first bit, but allow
116		   more for now. */
117		if (empty)
118			return -EINVAL;
119		break;
120	}
121	return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
122}
123/* Generate a custom zonelist for the BIND policy. */
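/*
 * For illustration (an assumed two-node layout, not taken from a real
 * machine): if nodes 0 and 2 each have a populated DMA and Normal zone,
 * the zonelist built below for nodes = {0,2} is
 *
 *	{ node0/Normal, node0/DMA, node2/Normal, node2/DMA, NULL }
 *
 * i.e. within each node the zones are tried from highest to lowest before
 * moving on to the next node in the mask.
 */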
124static struct zonelist *bind_zonelist(nodemask_t *nodes)
125{
126	struct zonelist *zl;
127	int num, max, nd;
128
129	max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
130	zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
131	if (!zl)
132		return NULL;
133	num = 0;
134	for_each_node_mask(nd, *nodes) {
135		int k;
136		for (k = MAX_NR_ZONES-1; k >= 0; k--) {
137			struct zone *z = &NODE_DATA(nd)->node_zones[k];
138			if (!z->present_pages)
139				continue;
140			zl->zones[num++] = z;
141			if (k > policy_zone)
142				policy_zone = k;
143		}
144	}
145	zl->zones[num] = NULL;
146	return zl;
147}
148
149/* Create a new policy */
150static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
151{
152	struct mempolicy *policy;
153
154	PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
155	if (mode == MPOL_DEFAULT)
156		return NULL;
157	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
158	if (!policy)
159		return ERR_PTR(-ENOMEM);
160	atomic_set(&policy->refcnt, 1);
161	switch (mode) {
162	case MPOL_INTERLEAVE:
163		policy->v.nodes = *nodes;
164		break;
165	case MPOL_PREFERRED:
166		policy->v.preferred_node = first_node(*nodes);
167		if (policy->v.preferred_node >= MAX_NUMNODES)
168			policy->v.preferred_node = -1;
169		break;
170	case MPOL_BIND:
171		policy->v.zonelist = bind_zonelist(nodes);
172		if (policy->v.zonelist == NULL) {
173			kmem_cache_free(policy_cache, policy);
174			return ERR_PTR(-ENOMEM);
175		}
176		break;
177	}
178	policy->policy = mode;
179	return policy;
180}
181
182/* Ensure all existing pages follow the policy. */
183static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
184		unsigned long addr, unsigned long end, nodemask_t *nodes)
185{
186	pte_t *orig_pte;
187	pte_t *pte;
188	spinlock_t *ptl;
189
190	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
191	do {
192		unsigned long pfn;
193		unsigned int nid;
194
195		if (!pte_present(*pte))
196			continue;
197		pfn = pte_pfn(*pte);
198		if (!pfn_valid(pfn)) {
199			print_bad_pte(vma, *pte, addr);
200			continue;
201		}
202		nid = pfn_to_nid(pfn);
203		if (!node_isset(nid, *nodes))
204			break;
205	} while (pte++, addr += PAGE_SIZE, addr != end);
206	pte_unmap_unlock(orig_pte, ptl);
207	return addr != end;
208}
209
210static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
211		unsigned long addr, unsigned long end, nodemask_t *nodes)
212{
213	pmd_t *pmd;
214	unsigned long next;
215
216	pmd = pmd_offset(pud, addr);
217	do {
218		next = pmd_addr_end(addr, end);
219		if (pmd_none_or_clear_bad(pmd))
220			continue;
221		if (check_pte_range(vma, pmd, addr, next, nodes))
222			return -EIO;
223	} while (pmd++, addr = next, addr != end);
224	return 0;
225}
226
227static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
228		unsigned long addr, unsigned long end, nodemask_t *nodes)
229{
230	pud_t *pud;
231	unsigned long next;
232
233	pud = pud_offset(pgd, addr);
234	do {
235		next = pud_addr_end(addr, end);
236		if (pud_none_or_clear_bad(pud))
237			continue;
238		if (check_pmd_range(vma, pud, addr, next, nodes))
239			return -EIO;
240	} while (pud++, addr = next, addr != end);
241	return 0;
242}
243
244static inline int check_pgd_range(struct vm_area_struct *vma,
245		unsigned long addr, unsigned long end, nodemask_t *nodes)
246{
247	pgd_t *pgd;
248	unsigned long next;
249
250	pgd = pgd_offset(vma->vm_mm, addr);
251	do {
252		next = pgd_addr_end(addr, end);
253		if (pgd_none_or_clear_bad(pgd))
254			continue;
255		if (check_pud_range(vma, pgd, addr, next, nodes))
256			return -EIO;
257	} while (pgd++, addr = next, addr != end);
258	return 0;
259}
260
261/* Step 1: check the range */
262static struct vm_area_struct *
263check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
264	    nodemask_t *nodes, unsigned long flags)
265{
266	int err;
267	struct vm_area_struct *first, *vma, *prev;
268
269	first = find_vma(mm, start);
270	if (!first)
271		return ERR_PTR(-EFAULT);
272	if (first->vm_flags & VM_UNPAGED)
273		return ERR_PTR(-EACCES);
274	prev = NULL;
275	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
276		if (!vma->vm_next && vma->vm_end < end)
277			return ERR_PTR(-EFAULT);
278		if (prev && prev->vm_end < vma->vm_start)
279			return ERR_PTR(-EFAULT);
280		if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) {
281			unsigned long endvma = vma->vm_end;
282			if (endvma > end)
283				endvma = end;
284			if (vma->vm_start > start)
285				start = vma->vm_start;
286			err = check_pgd_range(vma, start, endvma, nodes);
287			if (err) {
288				first = ERR_PTR(err);
289				break;
290			}
291		}
292		prev = vma;
293	}
294	return first;
295}
296
297/* Apply policy to a single VMA */
298static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
299{
300	int err = 0;
301	struct mempolicy *old = vma->vm_policy;
302
303	PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
304		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
305		 vma->vm_ops, vma->vm_file,
306		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
307
308	if (vma->vm_ops && vma->vm_ops->set_policy)
309		err = vma->vm_ops->set_policy(vma, new);
310	if (!err) {
311		mpol_get(new);
312		vma->vm_policy = new;
313		mpol_free(old);
314	}
315	return err;
316}
317
318/* Step 2: apply policy to a range and do splits. */
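/*
 * Example of the splitting done below: for a VMA spanning 0x1000-0x9000 and
 * an mbind() range of 0x3000-0x7000, split_vma() is called twice, leaving
 * VMAs 0x1000-0x3000, 0x3000-0x7000 and 0x7000-0x9000, and only the middle
 * one gets the new policy.
 */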
319static int mbind_range(struct vm_area_struct *vma, unsigned long start,
320		       unsigned long end, struct mempolicy *new)
321{
322	struct vm_area_struct *next;
323	int err;
324
325	err = 0;
326	for (; vma && vma->vm_start < end; vma = next) {
327		next = vma->vm_next;
328		if (vma->vm_start < start)
329			err = split_vma(vma->vm_mm, vma, start, 1);
330		if (!err && vma->vm_end > end)
331			err = split_vma(vma->vm_mm, vma, end, 0);
332		if (!err)
333			err = policy_vma(vma, new);
334		if (err)
335			break;
336	}
337	return err;
338}
339
340static int contextualize_policy(int mode, nodemask_t *nodes)
341{
342	if (!nodes)
343		return 0;
344
345	/* Update current mems_allowed */
346	cpuset_update_current_mems_allowed();
347	/* Ignore nodes not set in current->mems_allowed */
348	cpuset_restrict_to_mems_allowed(nodes->bits);
349	return mpol_check_policy(mode, nodes);
350}
351
352long do_mbind(unsigned long start, unsigned long len,
353		unsigned long mode, nodemask_t *nmask, unsigned long flags)
354{
355	struct vm_area_struct *vma;
356	struct mm_struct *mm = current->mm;
357	struct mempolicy *new;
358	unsigned long end;
359	int err;
360
361	if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
362		return -EINVAL;
363	if (start & ~PAGE_MASK)
364		return -EINVAL;
365	if (mode == MPOL_DEFAULT)
366		flags &= ~MPOL_MF_STRICT;
367	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
368	end = start + len;
369	if (end < start)
370		return -EINVAL;
371	if (end == start)
372		return 0;
373	if (mpol_check_policy(mode, nmask))
374		return -EINVAL;
375	new = mpol_new(mode, nmask);
376	if (IS_ERR(new))
377		return PTR_ERR(new);
378
379	PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
380			mode, nodes_addr(*nmask)[0]);
381
382	down_write(&mm->mmap_sem);
383	vma = check_range(mm, start, end, nmask, flags);
384	err = PTR_ERR(vma);
385	if (!IS_ERR(vma))
386		err = mbind_range(vma, start, end, new);
387	up_write(&mm->mmap_sem);
388	mpol_free(new);
389	return err;
390}
391
392/* Set the process memory policy */
393long do_set_mempolicy(int mode, nodemask_t *nodes)
394{
395	struct mempolicy *new;
396
397	if (contextualize_policy(mode, nodes))
398		return -EINVAL;
399	new = mpol_new(mode, nodes);
400	if (IS_ERR(new))
401		return PTR_ERR(new);
402	mpol_free(current->mempolicy);
403	current->mempolicy = new;
404	if (new && new->policy == MPOL_INTERLEAVE)
405		current->il_next = first_node(new->v.nodes);
406	return 0;
407}
408
409/* Fill a zone bitmap for a policy */
410static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
411{
412	int i;
413
414	nodes_clear(*nodes);
415	switch (p->policy) {
416	case MPOL_BIND:
417		for (i = 0; p->v.zonelist->zones[i]; i++)
418			node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id,
419				*nodes);
420		break;
421	case MPOL_DEFAULT:
422		break;
423	case MPOL_INTERLEAVE:
424		*nodes = p->v.nodes;
425		break;
426	case MPOL_PREFERRED:
427		/* or use current node instead of online map? */
428		if (p->v.preferred_node < 0)
429			*nodes = node_online_map;
430		else
431			node_set(p->v.preferred_node, *nodes);
432		break;
433	default:
434		BUG();
435	}
436}
437
438static int lookup_node(struct mm_struct *mm, unsigned long addr)
439{
440	struct page *p;
441	int err;
442
443	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
444	if (err >= 0) {
445		err = page_to_nid(p);
446		put_page(p);
447	}
448	return err;
449}
450
451/* Retrieve NUMA policy */
452long do_get_mempolicy(int *policy, nodemask_t *nmask,
453			unsigned long addr, unsigned long flags)
454{
455	int err;
456	struct mm_struct *mm = current->mm;
457	struct vm_area_struct *vma = NULL;
458	struct mempolicy *pol = current->mempolicy;
459
460	cpuset_update_current_mems_allowed();
461	if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
462		return -EINVAL;
463	if (flags & MPOL_F_ADDR) {
464		down_read(&mm->mmap_sem);
465		vma = find_vma_intersection(mm, addr, addr+1);
466		if (!vma) {
467			up_read(&mm->mmap_sem);
468			return -EFAULT;
469		}
470		if (vma->vm_ops && vma->vm_ops->get_policy)
471			pol = vma->vm_ops->get_policy(vma, addr);
472		else
473			pol = vma->vm_policy;
474	} else if (addr)
475		return -EINVAL;
476
477	if (!pol)
478		pol = &default_policy;
479
480	if (flags & MPOL_F_NODE) {
481		if (flags & MPOL_F_ADDR) {
482			err = lookup_node(mm, addr);
483			if (err < 0)
484				goto out;
485			*policy = err;
486		} else if (pol == current->mempolicy &&
487				pol->policy == MPOL_INTERLEAVE) {
488			*policy = current->il_next;
489		} else {
490			err = -EINVAL;
491			goto out;
492		}
493	} else
494		*policy = pol->policy;
495
496	if (vma) {
497		up_read(&current->mm->mmap_sem);
498		vma = NULL;
499	}
500
501	err = 0;
502	if (nmask)
503		get_zonemask(pol, nmask);
504
505 out:
506	if (vma)
507		up_read(&current->mm->mmap_sem);
508	return err;
509}
510
511/*
512 * User space interface with variable sized bitmaps for nodelists.
513 */
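/*
 * From user space the mask is passed as an array of unsigned longs plus a
 * bit count, which get_nodes()/copy_nodes_to_user() below translate to and
 * from the kernel's fixed-size nodemask_t.  A rough sketch (again via
 * <numaif.h>, not defined here, and assuming node 1 is online and allowed):
 *
 *	#include <numaif.h>
 *	#include <sys/mman.h>
 *
 *	void *buf = mmap(NULL, 1 << 20, PROT_READ | PROT_WRITE,
 *			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	unsigned long mask = 1UL << 1;		// node 1 only
 *
 *	// bind the range to node 1; with MPOL_MF_STRICT the call reports
 *	// EIO if pages are already present on other nodes
 *	mbind(buf, 1 << 20, MPOL_BIND, &mask, sizeof(mask) * 8,
 *	      MPOL_MF_STRICT);
 */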
514
515/* Copy a node mask from user space. */
516static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask,
517		     unsigned long maxnode)
518{
519	unsigned long k;
520	unsigned long nlongs;
521	unsigned long endmask;
522
523	--maxnode;
524	nodes_clear(*nodes);
525	if (maxnode == 0 || !nmask)
526		return 0;
527
528	nlongs = BITS_TO_LONGS(maxnode);
529	if ((maxnode % BITS_PER_LONG) == 0)
530		endmask = ~0UL;
531	else
532		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
533
534	/* When the user specified more nodes than supported just check
535	   if the unsupported part is all zero. */
536	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
537		if (nlongs > PAGE_SIZE/sizeof(long))
538			return -EINVAL;
539		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
540			unsigned long t;
541			if (get_user(t, nmask + k))
542				return -EFAULT;
543			if (k == nlongs - 1) {
544				if (t & endmask)
545					return -EINVAL;
546			} else if (t)
547				return -EINVAL;
548		}
549		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
550		endmask = ~0UL;
551	}
552
553	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
554		return -EFAULT;
555	nodes_addr(*nodes)[nlongs-1] &= endmask;
556	return 0;
557}
558
559/* Copy a kernel node mask to user space */
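/* e.g. maxnode = 1025 asks for ALIGN(1024, 64) / 8 = 128 bytes; whatever
   lies beyond the kernel's own nodemask size is cleared with clear_user()
   rather than copied. */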
560static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
561			      nodemask_t *nodes)
562{
563	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
564	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
565
566	if (copy > nbytes) {
567		if (copy > PAGE_SIZE)
568			return -EINVAL;
569		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
570			return -EFAULT;
571		copy = nbytes;
572	}
573	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
574}
575
576asmlinkage long sys_mbind(unsigned long start, unsigned long len,
577			unsigned long mode,
578			unsigned long __user *nmask, unsigned long maxnode,
579			unsigned flags)
580{
581	nodemask_t nodes;
582	int err;
583
584	err = get_nodes(&nodes, nmask, maxnode);
585	if (err)
586		return err;
587	return do_mbind(start, len, mode, &nodes, flags);
588}
589
590/* Set the process memory policy */
591asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
592		unsigned long maxnode)
593{
594	int err;
595	nodemask_t nodes;
596
597	if (mode < 0 || mode > MPOL_MAX)
598		return -EINVAL;
599	err = get_nodes(&nodes, nmask, maxnode);
600	if (err)
601		return err;
602	return do_set_mempolicy(mode, &nodes);
603}
604
605/* Retrieve NUMA policy */
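/*
 * Usage sketch (user space, via <numaif.h>): asking which node currently
 * backs a given address.  With nmask == NULL the maxnode < MAX_NUMNODES
 * check below does not apply, so maxnode can simply be 0:
 *
 *	int node = -1;
 *
 *	if (get_mempolicy(&node, NULL, 0, addr, MPOL_F_NODE|MPOL_F_ADDR) == 0)
 *		printf("%p is on node %d\n", addr, node);
 */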
606asmlinkage long sys_get_mempolicy(int __user *policy,
607				unsigned long __user *nmask,
608				unsigned long maxnode,
609				unsigned long addr, unsigned long flags)
610{
611	int err, pval;
612	nodemask_t nodes;
613
614	if (nmask != NULL && maxnode < MAX_NUMNODES)
615		return -EINVAL;
616
617	err = do_get_mempolicy(&pval, &nodes, addr, flags);
618
619	if (err)
620		return err;
621
622	if (policy && put_user(pval, policy))
623		return -EFAULT;
624
625	if (nmask)
626		err = copy_nodes_to_user(nmask, maxnode, &nodes);
627
628	return err;
629}
630
631#ifdef CONFIG_COMPAT
632
633asmlinkage long compat_sys_get_mempolicy(int __user *policy,
634				     compat_ulong_t __user *nmask,
635				     compat_ulong_t maxnode,
636				     compat_ulong_t addr, compat_ulong_t flags)
637{
638	long err;
639	unsigned long __user *nm = NULL;
640	unsigned long nr_bits, alloc_size;
641	DECLARE_BITMAP(bm, MAX_NUMNODES);
642
643	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
644	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
645
646	if (nmask)
647		nm = compat_alloc_user_space(alloc_size);
648
649	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
650
651	if (!err && nmask) {
652		err = copy_from_user(bm, nm, alloc_size);
653		/* ensure entire bitmap is zeroed */
654		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
655		err |= compat_put_bitmap(nmask, bm, nr_bits);
656	}
657
658	return err;
659}
660
661asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
662				     compat_ulong_t maxnode)
663{
664	long err = 0;
665	unsigned long __user *nm = NULL;
666	unsigned long nr_bits, alloc_size;
667	DECLARE_BITMAP(bm, MAX_NUMNODES);
668
669	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
670	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
671
672	if (nmask) {
673		err = compat_get_bitmap(bm, nmask, nr_bits);
674		nm = compat_alloc_user_space(alloc_size);
675		err |= copy_to_user(nm, bm, alloc_size);
676	}
677
678	if (err)
679		return -EFAULT;
680
681	return sys_set_mempolicy(mode, nm, nr_bits+1);
682}
683
684asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
685			     compat_ulong_t mode, compat_ulong_t __user *nmask,
686			     compat_ulong_t maxnode, compat_ulong_t flags)
687{
688	long err = 0;
689	unsigned long __user *nm = NULL;
690	unsigned long nr_bits, alloc_size;
691	nodemask_t bm;
692
693	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
694	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
695
696	if (nmask) {
697		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
698		nm = compat_alloc_user_space(alloc_size);
699		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
700	}
701
702	if (err)
703		return -EFAULT;
704
705	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
706}
707
708#endif
709
710/* Return effective policy for a VMA */
711struct mempolicy *
712get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr)
713{
714	struct mempolicy *pol = task->mempolicy;
715
716	if (vma) {
717		if (vma->vm_ops && vma->vm_ops->get_policy)
718			pol = vma->vm_ops->get_policy(vma, addr);
719		else if (vma->vm_policy &&
720				vma->vm_policy->policy != MPOL_DEFAULT)
721			pol = vma->vm_policy;
722	}
723	if (!pol)
724		pol = &default_policy;
725	return pol;
726}
727
728/* Return a zonelist representing a mempolicy */
729static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
730{
731	int nd;
732
733	switch (policy->policy) {
734	case MPOL_PREFERRED:
735		nd = policy->v.preferred_node;
736		if (nd < 0)
737			nd = numa_node_id();
738		break;
739	case MPOL_BIND:
740		/* Lower zones don't get a policy applied */
741		/* Careful: current->mems_allowed might have moved */
742		if (gfp_zone(gfp) >= policy_zone)
743			if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
744				return policy->v.zonelist;
745		/*FALL THROUGH*/
746	case MPOL_INTERLEAVE: /* should not happen */
747	case MPOL_DEFAULT:
748		nd = numa_node_id();
749		break;
750	default:
751		nd = 0;
752		BUG();
753	}
754	return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
755}
756
757/* Do dynamic interleaving for a process */
758static unsigned interleave_nodes(struct mempolicy *policy)
759{
760	unsigned nid, next;
761	struct task_struct *me = current;
762
763	nid = me->il_next;
764	next = next_node(nid, policy->v.nodes);
765	if (next >= MAX_NUMNODES)
766		next = first_node(policy->v.nodes);
767	me->il_next = next;
768	return nid;
769}
770
771/* Do static interleaving for a VMA with known offset. */
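/* Worked example: with pol->v.nodes = {0,2,5} and off = 7, nnodes = 3 and
   target = 7 % 3 = 1, so the loop below walks 0 -> 2 and returns node 2.
   The same offset always maps to the same node, which keeps the placement
   of a given page in a mapping stable across faults. */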
772static unsigned offset_il_node(struct mempolicy *pol,
773		struct vm_area_struct *vma, unsigned long off)
774{
775	unsigned nnodes = nodes_weight(pol->v.nodes);
776	unsigned target = (unsigned)off % nnodes;
777	int c;
778	int nid = -1;
779
780	c = 0;
781	do {
782		nid = next_node(nid, pol->v.nodes);
783		c++;
784	} while (c <= target);
785	return nid;
786}
787
788/* Allocate a page with the interleave policy.
789   Separate path because it needs to do special accounting. */
790static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
791					unsigned nid)
792{
793	struct zonelist *zl;
794	struct page *page;
795
796	zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
797	page = __alloc_pages(gfp, order, zl);
798	if (page && page_zone(page) == zl->zones[0]) {
799		zone_pcp(zl->zones[0],get_cpu())->interleave_hit++;
800		put_cpu();
801	}
802	return page;
803}
804
805/**
806 * 	alloc_page_vma	- Allocate a page for a VMA.
807 *
808 * 	@gfp:
809 *      %GFP_USER    user allocation.
810 *      %GFP_KERNEL  kernel allocations,
811 *      %GFP_HIGHMEM highmem/user allocations,
812 *      %GFP_FS      allocation should not call back into a file system.
813 *      %GFP_ATOMIC  don't sleep.
814 *
815 * 	@vma:  Pointer to VMA or NULL if not available.
816 *	@addr: Virtual Address of the allocation. Must be inside the VMA.
817 *
818 * 	This function allocates a page from the kernel page pool and applies
819 *	a NUMA policy associated with the VMA or the current process.
820 *	When VMA is not NULL caller must hold down_read on the mmap_sem of the
821 *	mm_struct of the VMA to prevent it from going away. Should be used for
822 *	all allocations for pages that will be mapped into
823 * 	user space. Returns NULL when no page can be allocated.
824 *
825 *	Should be called with the mmap_sem of the vma's mm held.
826 */
827struct page *
828alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
829{
830	struct mempolicy *pol = get_vma_policy(current, vma, addr);
831
832	cpuset_update_current_mems_allowed();
833
834	if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
835		unsigned nid;
836		if (vma) {
837			unsigned long off;
838			off = vma->vm_pgoff;
839			off += (addr - vma->vm_start) >> PAGE_SHIFT;
840			nid = offset_il_node(pol, vma, off);
841		} else {
842			/* fall back to process interleaving */
843			nid = interleave_nodes(pol);
844		}
845		return alloc_page_interleave(gfp, 0, nid);
846	}
847	return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
848}
849
850/**
851 * 	alloc_pages_current - Allocate pages.
852 *
853 *	@gfp:
854 *		%GFP_USER   user allocation,
855 *      	%GFP_KERNEL kernel allocation,
856 *      	%GFP_HIGHMEM highmem allocation,
857 *      	%GFP_FS     don't call back into a file system.
858 *      	%GFP_ATOMIC don't sleep.
859 *	@order: Power of two of allocation size in pages. 0 is a single page.
860 *
861 *	Allocate a page from the kernel page pool.  When not in
862 *	interrupt context, apply the current process' NUMA policy.
863 *	Returns NULL when no page can be allocated.
864 *
865 *	Don't call cpuset_update_current_mems_allowed() unless
866 *	1) it's ok to take cpuset_sem (can WAIT), and
867 *	2) allocating for current task (not interrupt).
868 */
869struct page *alloc_pages_current(gfp_t gfp, unsigned order)
870{
871	struct mempolicy *pol = current->mempolicy;
872
873	if ((gfp & __GFP_WAIT) && !in_interrupt())
874		cpuset_update_current_mems_allowed();
875	if (!pol || in_interrupt())
876		pol = &default_policy;
877	if (pol->policy == MPOL_INTERLEAVE)
878		return alloc_page_interleave(gfp, order, interleave_nodes(pol));
879	return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
880}
881EXPORT_SYMBOL(alloc_pages_current);
882
883/* Slow path of a mempolicy copy */
884struct mempolicy *__mpol_copy(struct mempolicy *old)
885{
886	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
887
888	if (!new)
889		return ERR_PTR(-ENOMEM);
890	*new = *old;
891	atomic_set(&new->refcnt, 1);
892	if (new->policy == MPOL_BIND) {
893		int sz = ksize(old->v.zonelist);
894		new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
895		if (!new->v.zonelist) {
896			kmem_cache_free(policy_cache, new);
897			return ERR_PTR(-ENOMEM);
898		}
899		memcpy(new->v.zonelist, old->v.zonelist, sz);
900	}
901	return new;
902}
903
904/* Slow path of a mempolicy comparison */
905int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
906{
907	if (!a || !b)
908		return 0;
909	if (a->policy != b->policy)
910		return 0;
911	switch (a->policy) {
912	case MPOL_DEFAULT:
913		return 1;
914	case MPOL_INTERLEAVE:
915		return nodes_equal(a->v.nodes, b->v.nodes);
916	case MPOL_PREFERRED:
917		return a->v.preferred_node == b->v.preferred_node;
918	case MPOL_BIND: {
919		int i;
920		for (i = 0; a->v.zonelist->zones[i]; i++)
921			if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
922				return 0;
923		return b->v.zonelist->zones[i] == NULL;
924	}
925	default:
926		BUG();
927		return 0;
928	}
929}
930
931/* Slow path of a mpol destructor. */
932void __mpol_free(struct mempolicy *p)
933{
934	if (!atomic_dec_and_test(&p->refcnt))
935		return;
936	if (p->policy == MPOL_BIND)
937		kfree(p->v.zonelist);
938	p->policy = MPOL_DEFAULT;
939	kmem_cache_free(policy_cache, p);
940}
941
942/*
943 * Hugetlb policy. Same as above, just works with node numbers instead of
944 * zonelists.
945 */
946
947/* Find first node suitable for an allocation */
948int mpol_first_node(struct vm_area_struct *vma, unsigned long addr)
949{
950	struct mempolicy *pol = get_vma_policy(current, vma, addr);
951
952	switch (pol->policy) {
953	case MPOL_DEFAULT:
954		return numa_node_id();
955	case MPOL_BIND:
956		return pol->v.zonelist->zones[0]->zone_pgdat->node_id;
957	case MPOL_INTERLEAVE:
958		return interleave_nodes(pol);
959	case MPOL_PREFERRED:
960		return pol->v.preferred_node >= 0 ?
961				pol->v.preferred_node : numa_node_id();
962	}
963	BUG();
964	return 0;
965}
966
967/* Find secondary valid nodes for an allocation */
968int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr)
969{
970	struct mempolicy *pol = get_vma_policy(current, vma, addr);
971
972	switch (pol->policy) {
973	case MPOL_PREFERRED:
974	case MPOL_DEFAULT:
975	case MPOL_INTERLEAVE:
976		return 1;
977	case MPOL_BIND: {
978		struct zone **z;
979		for (z = pol->v.zonelist->zones; *z; z++)
980			if ((*z)->zone_pgdat->node_id == nid)
981				return 1;
982		return 0;
983	}
984	default:
985		BUG();
986		return 0;
987	}
988}
989
990/*
991 * Shared memory backing store policy support.
992 *
993 * Remember policies even when nobody has shared memory mapped.
994 * The policies are kept in a red-black tree linked from the inode.
995 * They are protected by the sp->lock spinlock, which should be held
996 * for any accesses to the tree.
997 */
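/*
 * A filesystem that wants per-inode policies embeds a struct shared_policy
 * in its inode info and wires its vm_ops up roughly as sketched below
 * (hypothetical FOO_I()/foo_* names, modelled on what tmpfs does):
 *
 *	static int foo_set_policy(struct vm_area_struct *vma,
 *				  struct mempolicy *new)
 *	{
 *		struct inode *i = vma->vm_file->f_dentry->d_inode;
 *		return mpol_set_shared_policy(&FOO_I(i)->policy, vma, new);
 *	}
 *
 *	static struct mempolicy *foo_get_policy(struct vm_area_struct *vma,
 *						unsigned long addr)
 *	{
 *		struct inode *i = vma->vm_file->f_dentry->d_inode;
 *		unsigned long idx = vma->vm_pgoff +
 *			((addr - vma->vm_start) >> PAGE_SHIFT);
 *
 *		return mpol_shared_policy_lookup(&FOO_I(i)->policy, idx);
 *	}
 */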
998
999/* lookup first element intersecting start-end */
1000/* Caller holds sp->lock */
1001static struct sp_node *
1002sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1003{
1004	struct rb_node *n = sp->root.rb_node;
1005
1006	while (n) {
1007		struct sp_node *p = rb_entry(n, struct sp_node, nd);
1008
1009		if (start >= p->end)
1010			n = n->rb_right;
1011		else if (end <= p->start)
1012			n = n->rb_left;
1013		else
1014			break;
1015	}
1016	if (!n)
1017		return NULL;
1018	for (;;) {
1019		struct sp_node *w = NULL;
1020		struct rb_node *prev = rb_prev(n);
1021		if (!prev)
1022			break;
1023		w = rb_entry(prev, struct sp_node, nd);
1024		if (w->end <= start)
1025			break;
1026		n = prev;
1027	}
1028	return rb_entry(n, struct sp_node, nd);
1029}
1030
1031/* Insert a new shared policy into the list. */
1032/* Caller holds sp->lock */
1033static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1034{
1035	struct rb_node **p = &sp->root.rb_node;
1036	struct rb_node *parent = NULL;
1037	struct sp_node *nd;
1038
1039	while (*p) {
1040		parent = *p;
1041		nd = rb_entry(parent, struct sp_node, nd);
1042		if (new->start < nd->start)
1043			p = &(*p)->rb_left;
1044		else if (new->end > nd->end)
1045			p = &(*p)->rb_right;
1046		else
1047			BUG();
1048	}
1049	rb_link_node(&new->nd, parent, p);
1050	rb_insert_color(&new->nd, &sp->root);
1051	PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
1052		 new->policy ? new->policy->policy : 0);
1053}
1054
1055/* Find shared policy intersecting idx */
1056struct mempolicy *
1057mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1058{
1059	struct mempolicy *pol = NULL;
1060	struct sp_node *sn;
1061
1062	if (!sp->root.rb_node)
1063		return NULL;
1064	spin_lock(&sp->lock);
1065	sn = sp_lookup(sp, idx, idx+1);
1066	if (sn) {
1067		mpol_get(sn->policy);
1068		pol = sn->policy;
1069	}
1070	spin_unlock(&sp->lock);
1071	return pol;
1072}
1073
1074static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1075{
1076	PDprintk("deleting %lx-%lx\n", n->start, n->end);
1077	rb_erase(&n->nd, &sp->root);
1078	mpol_free(n->policy);
1079	kmem_cache_free(sn_cache, n);
1080}
1081
1082struct sp_node *
1083sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
1084{
1085	struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1086
1087	if (!n)
1088		return NULL;
1089	n->start = start;
1090	n->end = end;
1091	mpol_get(pol);
1092	n->policy = pol;
1093	return n;
1094}
1095
1096/* Replace a policy range. */
1097static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1098				 unsigned long end, struct sp_node *new)
1099{
1100	struct sp_node *n, *new2 = NULL;
1101
1102restart:
1103	spin_lock(&sp->lock);
1104	n = sp_lookup(sp, start, end);
1105	/* Take care of old policies in the same range. */
1106	while (n && n->start < end) {
1107		struct rb_node *next = rb_next(&n->nd);
1108		if (n->start >= start) {
1109			if (n->end <= end)
1110				sp_delete(sp, n);
1111			else
1112				n->start = end;
1113		} else {
1114			/* Old policy spanning whole new range. */
1115			if (n->end > end) {
1116				if (!new2) {
1117					spin_unlock(&sp->lock);
1118					new2 = sp_alloc(end, n->end, n->policy);
1119					if (!new2)
1120						return -ENOMEM;
1121					goto restart;
1122				}
1123				n->end = start;
1124				sp_insert(sp, new2);
1125				new2 = NULL;
1126				break;
1127			} else
1128				n->end = start;
1129		}
1130		if (!next)
1131			break;
1132		n = rb_entry(next, struct sp_node, nd);
1133	}
1134	if (new)
1135		sp_insert(sp, new);
1136	spin_unlock(&sp->lock);
1137	if (new2) {
1138		mpol_free(new2->policy);
1139		kmem_cache_free(sn_cache, new2);
1140	}
1141	return 0;
1142}
1143
1144int mpol_set_shared_policy(struct shared_policy *info,
1145			struct vm_area_struct *vma, struct mempolicy *npol)
1146{
1147	int err;
1148	struct sp_node *new = NULL;
1149	unsigned long sz = vma_pages(vma);
1150
1151	PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
1152		 vma->vm_pgoff,
1153		 sz, npol? npol->policy : -1,
1154		npol ? nodes_addr(npol->v.nodes)[0] : -1);
1155
1156	if (npol) {
1157		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1158		if (!new)
1159			return -ENOMEM;
1160	}
1161	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1162	if (err && new)
1163		kmem_cache_free(sn_cache, new);
1164	return err;
1165}
1166
1167/* Free a backing policy store on inode delete. */
1168void mpol_free_shared_policy(struct shared_policy *p)
1169{
1170	struct sp_node *n;
1171	struct rb_node *next;
1172
1173	if (!p->root.rb_node)
1174		return;
1175	spin_lock(&p->lock);
1176	next = rb_first(&p->root);
1177	while (next) {
1178		n = rb_entry(next, struct sp_node, nd);
1179		next = rb_next(&n->nd);
1180		rb_erase(&n->nd, &p->root);
1181		mpol_free(n->policy);
1182		kmem_cache_free(sn_cache, n);
1183	}
1184	spin_unlock(&p->lock);
1185}
1186
1187/* assumes fs == KERNEL_DS */
1188void __init numa_policy_init(void)
1189{
1190	policy_cache = kmem_cache_create("numa_policy",
1191					 sizeof(struct mempolicy),
1192					 0, SLAB_PANIC, NULL, NULL);
1193
1194	sn_cache = kmem_cache_create("shared_policy_node",
1195				     sizeof(struct sp_node),
1196				     0, SLAB_PANIC, NULL, NULL);
1197
1198	/* Set interleaving policy for system init. This way not all
1199	   the data structures allocated at system boot end up in node zero. */
1200
1201	if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
1202		printk("numa_policy_init: interleaving failed\n");
1203}
1204
1205/* Reset policy of current process to default */
1206void numa_default_policy(void)
1207{
1208	do_set_mempolicy(MPOL_DEFAULT, NULL);
1209}
1210
1211/* Migrate a policy to a different set of nodes */
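/* Worked example: when a task moves from nodes {0,1} to nodes {4,5}, an
   MPOL_INTERLEAVE policy over {0,1} is remapped to {4,5} and an
   MPOL_PREFERRED policy for node 1 becomes node 5; node_remap() matches
   nodes up by their position within the old and new masks. */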
1212static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
1213							const nodemask_t *new)
1214{
1215	nodemask_t tmp;
1216
1217	if (!pol)
1218		return;
1219
1220	switch (pol->policy) {
1221	case MPOL_DEFAULT:
1222		break;
1223	case MPOL_INTERLEAVE:
1224		nodes_remap(tmp, pol->v.nodes, *old, *new);
1225		pol->v.nodes = tmp;
1226		current->il_next = node_remap(current->il_next, *old, *new);
1227		break;
1228	case MPOL_PREFERRED:
1229		pol->v.preferred_node = node_remap(pol->v.preferred_node,
1230								*old, *new);
1231		break;
1232	case MPOL_BIND: {
1233		nodemask_t nodes;
1234		struct zone **z;
1235		struct zonelist *zonelist;
1236
1237		nodes_clear(nodes);
1238		for (z = pol->v.zonelist->zones; *z; z++)
1239			node_set((*z)->zone_pgdat->node_id, nodes);
1240		nodes_remap(tmp, nodes, *old, *new);
1241		nodes = tmp;
1242
1243		zonelist = bind_zonelist(&nodes);
1244
1245		/* If no mem, then zonelist is NULL and we keep old zonelist.
1246		 * If that old zonelist has no remaining mems_allowed nodes,
1247		 * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
1248		 */
1249
1250		if (zonelist) {
1251			/* Good - got mem - substitute new zonelist */
1252			kfree(pol->v.zonelist);
1253			pol->v.zonelist = zonelist;
1254		}
1255		break;
1256	}
1257	default:
1258		BUG();
1259		break;
1260	}
1261}
1262
1263/*
1264 * Someone moved this task to different nodes.  Fixup mempolicies.
1265 *
1266 * TODO - fixup current->mm->vma and shmfs/tmpfs/hugetlbfs policies as well,
1267 * once we have a cpuset mechanism to mark which cpuset subtree is migrating.
1268 */
1269void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new)
1270{
1271	rebind_policy(current->mempolicy, old, new);
1272}
1273