mempolicy.c revision 15ad7cdcfd76450d4beebc789ec646664238184d
1/*
2 * Simple NUMA memory policy for the Linux kernel.
3 *
4 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
6 * Subject to the GNU Public License, version 2.
7 *
8 * NUMA policy allows the user to give hints in which node(s) memory should
9 * be allocated.
10 *
11 * Support four policies per VMA and per process:
12 *
13 * The VMA policy has priority over the process policy for a page fault.
14 *
15 * interleave     Allocate memory interleaved over a set of nodes,
16 *                with normal fallback if it fails.
17 *                For VMA based allocations this interleaves based on the
18 *                offset into the backing object or offset into the mapping
19 *                for anonymous memory. For process policy a process counter
20 *                is used.
21 *
22 * bind           Only allocate memory on a specific set of nodes,
23 *                no fallback.
24 *                FIXME: memory is allocated starting with the first node
25 *                to the last. It would be better if bind truly restricted
26 *                the allocation to the specified memory nodes instead.
27 *
28 * preferred       Try a specific node first before normal fallback.
29 *                As a special case node -1 here means do the allocation
30 *                on the local CPU. This is normally identical to default,
31 *                but useful to set in a VMA when you have a non default
32 *                process policy.
33 *
34 * default        Allocate on the local node first, or when on a VMA
35 *                use the process policy. This is what Linux always did
36 *		  in a NUMA aware kernel and still does by, ahem, default.
37 *
38 * The process policy is applied for most non-interrupt memory allocations
39 * in that process' context. Interrupts ignore the policies and always
40 * try to allocate on the local CPU. The VMA policy is only applied for memory
41 * allocations for a VMA in the VM.
42 *
43 * Currently there are a few corner cases in swapping where the policy
44 * is not applied, but the majority should be handled. When process policy
45 * is used it is not remembered over swap outs/swap ins.
46 *
47 * Only the highest zone in the zone hierarchy gets policied. Allocations
48 * requesting a lower zone just use default policy. This implies that
49 * on systems with highmem, kernel lowmem allocations don't get policied.
50 * The same applies to GFP_DMA allocations.
51 *
52 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
53 * all users and remembered even when nobody has memory mapped.
54 */
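/*
 * Illustrative userspace sketch (not part of this file; assumes syscall
 * wrappers for set_mempolicy(2)/mbind(2), e.g. from libnuma, and a
 * placeholder mapping addr/len):
 *
 *	unsigned long mask = (1UL << 0) | (1UL << 1);
 *	// interleave all future allocations of this task over nodes 0 and 1
 *	set_mempolicy(MPOL_INTERLEAVE, &mask, sizeof(mask) * 8);
 *
 *	unsigned long node0 = 1UL << 0;
 *	// bind an existing mapping to node 0 and migrate misplaced pages
 *	mbind(addr, len, MPOL_BIND, &node0, sizeof(node0) * 8, MPOL_MF_MOVE);
 */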
55
56/* Notebook:
57   fix mmap readahead to honour policy and enable policy for any page cache
58   object
59   statistics for bigpages
60   global policy for page cache? currently it uses process policy. Requires
61   first item above.
62   handle mremap for shared memory (currently ignored for the policy)
63   grows down?
64   make bind policy root only? It can trigger oom much faster and the
65   kernel is not always graceful about that.
66   could replace all the switch()es with a mempolicy_ops structure.
67*/
68
69#include <linux/mempolicy.h>
70#include <linux/mm.h>
71#include <linux/highmem.h>
72#include <linux/hugetlb.h>
73#include <linux/kernel.h>
74#include <linux/sched.h>
76#include <linux/nodemask.h>
77#include <linux/cpuset.h>
78#include <linux/gfp.h>
79#include <linux/slab.h>
80#include <linux/string.h>
81#include <linux/module.h>
82#include <linux/interrupt.h>
83#include <linux/init.h>
84#include <linux/compat.h>
86#include <linux/swap.h>
87#include <linux/seq_file.h>
88#include <linux/proc_fs.h>
89#include <linux/migrate.h>
90#include <linux/rmap.h>
91#include <linux/security.h>
92
93#include <asm/tlbflush.h>
94#include <asm/uaccess.h>
95
96/* Internal flags */
97#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for contiguous vmas */
98#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
99#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)		/* Gather statistics */
100
101static struct kmem_cache *policy_cache;
102static struct kmem_cache *sn_cache;
103
104#define PDprintk(fmt...)
105
106/* Highest zone. A specific allocation for a zone below that is not
107   policied. */
108enum zone_type policy_zone = ZONE_DMA;
109
110struct mempolicy default_policy = {
111	.refcnt = ATOMIC_INIT(1), /* never free it */
112	.policy = MPOL_DEFAULT,
113};
114
115/* Do sanity checking on a policy */
116static int mpol_check_policy(int mode, nodemask_t *nodes)
117{
118	int empty = nodes_empty(*nodes);
119
120	switch (mode) {
121	case MPOL_DEFAULT:
122		if (!empty)
123			return -EINVAL;
124		break;
125	case MPOL_BIND:
126	case MPOL_INTERLEAVE:
127		/* Preferred will only use the first bit, but allow
128		   more for now. */
129		if (empty)
130			return -EINVAL;
131		break;
132	}
133	return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
134}
135
136/* Generate a custom zonelist for the BIND policy. */
137static struct zonelist *bind_zonelist(nodemask_t *nodes)
138{
139	struct zonelist *zl;
140	int num, max, nd;
141	enum zone_type k;
142
143	max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
144	max++;			/* space for zlcache_ptr (see mmzone.h) */
145	zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
146	if (!zl)
147		return NULL;
148	zl->zlcache_ptr = NULL;
149	num = 0;
150	/* First put in the highest zones from all nodes, then all the next
151	   lower zones etc. Avoid empty zones because the memory allocator
152	   doesn't like them. If you implement node hot removal you
153	   have to fix that. */
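	/* Illustration (assuming nodes {0,2} are online, ZONE_NORMAL is the
	   highest policy zone and only ZONE_DMA lies below it): the order
	   built here would be node0/NORMAL, node2/NORMAL, node0/DMA,
	   node2/DMA, with empty zones skipped. */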
154	k = policy_zone;
155	while (1) {
156		for_each_node_mask(nd, *nodes) {
157			struct zone *z = &NODE_DATA(nd)->node_zones[k];
158			if (z->present_pages > 0)
159				zl->zones[num++] = z;
160		}
161		if (k == 0)
162			break;
163		k--;
164	}
165	zl->zones[num] = NULL;
166	return zl;
167}
168
169/* Create a new policy */
170static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
171{
172	struct mempolicy *policy;
173
174	PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
175	if (mode == MPOL_DEFAULT)
176		return NULL;
177	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
178	if (!policy)
179		return ERR_PTR(-ENOMEM);
180	atomic_set(&policy->refcnt, 1);
181	switch (mode) {
182	case MPOL_INTERLEAVE:
183		policy->v.nodes = *nodes;
184		if (nodes_weight(*nodes) == 0) {
185			kmem_cache_free(policy_cache, policy);
186			return ERR_PTR(-EINVAL);
187		}
188		break;
189	case MPOL_PREFERRED:
190		policy->v.preferred_node = first_node(*nodes);
191		if (policy->v.preferred_node >= MAX_NUMNODES)
192			policy->v.preferred_node = -1;
193		break;
194	case MPOL_BIND:
195		policy->v.zonelist = bind_zonelist(nodes);
196		if (policy->v.zonelist == NULL) {
197			kmem_cache_free(policy_cache, policy);
198			return ERR_PTR(-ENOMEM);
199		}
200		break;
201	}
202	policy->policy = mode;
203	policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
204	return policy;
205}
206
207static void gather_stats(struct page *, void *, int pte_dirty);
208static void migrate_page_add(struct page *page, struct list_head *pagelist,
209				unsigned long flags);
210
211/* Scan through pages checking if pages follow certain conditions. */
212static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
213		unsigned long addr, unsigned long end,
214		const nodemask_t *nodes, unsigned long flags,
215		void *private)
216{
217	pte_t *orig_pte;
218	pte_t *pte;
219	spinlock_t *ptl;
220
221	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
222	do {
223		struct page *page;
224		int nid;
225
226		if (!pte_present(*pte))
227			continue;
228		page = vm_normal_page(vma, addr, *pte);
229		if (!page)
230			continue;
231		/*
232		 * The check for PageReserved here is important to avoid
233		 * handling zero pages and other pages that may have been
234		 * marked special by the system.
235		 *
236		 * If PageReserved were not checked here then e.g.
237		 * the location of the zero page could have an influence
238		 * on MPOL_MF_STRICT, zero pages would be counted in
239		 * the per-node stats, and there would be useless attempts
240		 * to put zero pages on the migration list.
241		 */
242		if (PageReserved(page))
243			continue;
244		nid = page_to_nid(page);
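		/*
		 * With MPOL_MF_INVERT the pages that are NOT on the given
		 * nodes are selected (do_mbind() uses this to find misplaced
		 * pages); without it only pages that ARE on the nodes are
		 * considered (migrate_to_node() passes a single source node).
		 */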
245		if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
246			continue;
247
248		if (flags & MPOL_MF_STATS)
249			gather_stats(page, private, pte_dirty(*pte));
250		else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
251			migrate_page_add(page, private, flags);
252		else
253			break;
254	} while (pte++, addr += PAGE_SIZE, addr != end);
255	pte_unmap_unlock(orig_pte, ptl);
256	return addr != end;
257}
258
259static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
260		unsigned long addr, unsigned long end,
261		const nodemask_t *nodes, unsigned long flags,
262		void *private)
263{
264	pmd_t *pmd;
265	unsigned long next;
266
267	pmd = pmd_offset(pud, addr);
268	do {
269		next = pmd_addr_end(addr, end);
270		if (pmd_none_or_clear_bad(pmd))
271			continue;
272		if (check_pte_range(vma, pmd, addr, next, nodes,
273				    flags, private))
274			return -EIO;
275	} while (pmd++, addr = next, addr != end);
276	return 0;
277}
278
279static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
280		unsigned long addr, unsigned long end,
281		const nodemask_t *nodes, unsigned long flags,
282		void *private)
283{
284	pud_t *pud;
285	unsigned long next;
286
287	pud = pud_offset(pgd, addr);
288	do {
289		next = pud_addr_end(addr, end);
290		if (pud_none_or_clear_bad(pud))
291			continue;
292		if (check_pmd_range(vma, pud, addr, next, nodes,
293				    flags, private))
294			return -EIO;
295	} while (pud++, addr = next, addr != end);
296	return 0;
297}
298
299static inline int check_pgd_range(struct vm_area_struct *vma,
300		unsigned long addr, unsigned long end,
301		const nodemask_t *nodes, unsigned long flags,
302		void *private)
303{
304	pgd_t *pgd;
305	unsigned long next;
306
307	pgd = pgd_offset(vma->vm_mm, addr);
308	do {
309		next = pgd_addr_end(addr, end);
310		if (pgd_none_or_clear_bad(pgd))
311			continue;
312		if (check_pud_range(vma, pgd, addr, next, nodes,
313				    flags, private))
314			return -EIO;
315	} while (pgd++, addr = next, addr != end);
316	return 0;
317}
318
319/* Check if a vma is migratable */
320static inline int vma_migratable(struct vm_area_struct *vma)
321{
322	if (vma->vm_flags & (
323		VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP|VM_RESERVED))
324		return 0;
325	return 1;
326}
327
328/*
329 * Check if all pages in a range are on a set of nodes.
330 * If pagelist != NULL then isolate pages from the LRU and
331 * put them on the pagelist.
332 */
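/* Returns the first vma of the range on success, or an ERR_PTR() value. */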
333static struct vm_area_struct *
334check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
335		const nodemask_t *nodes, unsigned long flags, void *private)
336{
337	int err;
338	struct vm_area_struct *first, *vma, *prev;
339
340	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
341
342		err = migrate_prep();
343		if (err)
344			return ERR_PTR(err);
345	}
346
347	first = find_vma(mm, start);
348	if (!first)
349		return ERR_PTR(-EFAULT);
350	prev = NULL;
351	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
352		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
353			if (!vma->vm_next && vma->vm_end < end)
354				return ERR_PTR(-EFAULT);
355			if (prev && prev->vm_end < vma->vm_start)
356				return ERR_PTR(-EFAULT);
357		}
358		if (!is_vm_hugetlb_page(vma) &&
359		    ((flags & MPOL_MF_STRICT) ||
360		     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
361				vma_migratable(vma)))) {
362			unsigned long endvma = vma->vm_end;
363
364			if (endvma > end)
365				endvma = end;
366			if (vma->vm_start > start)
367				start = vma->vm_start;
368			err = check_pgd_range(vma, start, endvma, nodes,
369						flags, private);
370			if (err) {
371				first = ERR_PTR(err);
372				break;
373			}
374		}
375		prev = vma;
376	}
377	return first;
378}
379
380/* Apply policy to a single VMA */
381static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
382{
383	int err = 0;
384	struct mempolicy *old = vma->vm_policy;
385
386	PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
387		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
388		 vma->vm_ops, vma->vm_file,
389		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
390
391	if (vma->vm_ops && vma->vm_ops->set_policy)
392		err = vma->vm_ops->set_policy(vma, new);
393	if (!err) {
394		mpol_get(new);
395		vma->vm_policy = new;
396		mpol_free(old);
397	}
398	return err;
399}
400
401/* Step 2: apply policy to a range and do splits. */
402static int mbind_range(struct vm_area_struct *vma, unsigned long start,
403		       unsigned long end, struct mempolicy *new)
404{
405	struct vm_area_struct *next;
406	int err;
407
408	err = 0;
409	for (; vma && vma->vm_start < end; vma = next) {
410		next = vma->vm_next;
411		if (vma->vm_start < start)
412			err = split_vma(vma->vm_mm, vma, start, 1);
413		if (!err && vma->vm_end > end)
414			err = split_vma(vma->vm_mm, vma, end, 0);
415		if (!err)
416			err = policy_vma(vma, new);
417		if (err)
418			break;
419	}
420	return err;
421}
422
423static int contextualize_policy(int mode, nodemask_t *nodes)
424{
425	if (!nodes)
426		return 0;
427
428	cpuset_update_task_memory_state();
429	if (!cpuset_nodes_subset_current_mems_allowed(*nodes))
430		return -EINVAL;
431	return mpol_check_policy(mode, nodes);
432}
433
434
435/*
436 * Update task->flags PF_MEMPOLICY bit: set iff non-default
437 * mempolicy.  Allows more rapid checking of this (combined perhaps
438 * with other PF_* flag bits) on memory allocation hot code paths.
439 *
440 * If called from outside this file, the task 'p' should -only- be
441 * a newly forked child not yet visible on the task list, because
442 * manipulating the task flags of a visible task is not safe.
443 *
444 * The above limitation is why this routine has the funny name
445 * mpol_fix_fork_child_flag().
446 *
447 * It is also safe to call this with a task pointer of current,
448 * which the static wrapper mpol_set_task_struct_flag() does,
449 * for use within this file.
450 */
451
452void mpol_fix_fork_child_flag(struct task_struct *p)
453{
454	if (p->mempolicy)
455		p->flags |= PF_MEMPOLICY;
456	else
457		p->flags &= ~PF_MEMPOLICY;
458}
459
460static void mpol_set_task_struct_flag(void)
461{
462	mpol_fix_fork_child_flag(current);
463}
464
465/* Set the process memory policy */
466long do_set_mempolicy(int mode, nodemask_t *nodes)
467{
468	struct mempolicy *new;
469
470	if (contextualize_policy(mode, nodes))
471		return -EINVAL;
472	new = mpol_new(mode, nodes);
473	if (IS_ERR(new))
474		return PTR_ERR(new);
475	mpol_free(current->mempolicy);
476	current->mempolicy = new;
477	mpol_set_task_struct_flag();
478	if (new && new->policy == MPOL_INTERLEAVE)
479		current->il_next = first_node(new->v.nodes);
480	return 0;
481}
482
483/* Fill a node mask for a policy */
484static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
485{
486	int i;
487
488	nodes_clear(*nodes);
489	switch (p->policy) {
490	case MPOL_BIND:
491		for (i = 0; p->v.zonelist->zones[i]; i++)
492			node_set(zone_to_nid(p->v.zonelist->zones[i]),
493				*nodes);
494		break;
495	case MPOL_DEFAULT:
496		break;
497	case MPOL_INTERLEAVE:
498		*nodes = p->v.nodes;
499		break;
500	case MPOL_PREFERRED:
501		/* or use current node instead of online map? */
502		if (p->v.preferred_node < 0)
503			*nodes = node_online_map;
504		else
505			node_set(p->v.preferred_node, *nodes);
506		break;
507	default:
508		BUG();
509	}
510}
511
512static int lookup_node(struct mm_struct *mm, unsigned long addr)
513{
514	struct page *p;
515	int err;
516
517	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
518	if (err >= 0) {
519		err = page_to_nid(p);
520		put_page(p);
521	}
522	return err;
523}
524
525/* Retrieve NUMA policy */
526long do_get_mempolicy(int *policy, nodemask_t *nmask,
527			unsigned long addr, unsigned long flags)
528{
529	int err;
530	struct mm_struct *mm = current->mm;
531	struct vm_area_struct *vma = NULL;
532	struct mempolicy *pol = current->mempolicy;
533
534	cpuset_update_task_memory_state();
535	if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
536		return -EINVAL;
537	if (flags & MPOL_F_ADDR) {
538		down_read(&mm->mmap_sem);
539		vma = find_vma_intersection(mm, addr, addr+1);
540		if (!vma) {
541			up_read(&mm->mmap_sem);
542			return -EFAULT;
543		}
544		if (vma->vm_ops && vma->vm_ops->get_policy)
545			pol = vma->vm_ops->get_policy(vma, addr);
546		else
547			pol = vma->vm_policy;
548	} else if (addr)
549		return -EINVAL;
550
551	if (!pol)
552		pol = &default_policy;
553
554	if (flags & MPOL_F_NODE) {
555		if (flags & MPOL_F_ADDR) {
556			err = lookup_node(mm, addr);
557			if (err < 0)
558				goto out;
559			*policy = err;
560		} else if (pol == current->mempolicy &&
561				pol->policy == MPOL_INTERLEAVE) {
562			*policy = current->il_next;
563		} else {
564			err = -EINVAL;
565			goto out;
566		}
567	} else
568		*policy = pol->policy;
569
570	if (vma) {
571		up_read(&current->mm->mmap_sem);
572		vma = NULL;
573	}
574
575	err = 0;
576	if (nmask)
577		get_zonemask(pol, nmask);
578
579 out:
580	if (vma)
581		up_read(&current->mm->mmap_sem);
582	return err;
583}
584
585#ifdef CONFIG_MIGRATION
586/*
587 * page migration
588 */
589static void migrate_page_add(struct page *page, struct list_head *pagelist,
590				unsigned long flags)
591{
592	/*
593	 * Avoid migrating a page that is shared with others.
594	 */
595	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
596		isolate_lru_page(page, pagelist);
597}
598
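/* Allocation callback for migrate_pages(): place the new page on the target node. */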
599static struct page *new_node_page(struct page *page, unsigned long node, int **x)
600{
601	return alloc_pages_node(node, GFP_HIGHUSER, 0);
602}
603
604/*
605 * Migrate pages from one node to a target node.
606 * Returns error or the number of pages not migrated.
607 */
608int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags)
609{
610	nodemask_t nmask;
611	LIST_HEAD(pagelist);
612	int err = 0;
613
614	nodes_clear(nmask);
615	node_set(source, nmask);
616
617	check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
618			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
619
620	if (!list_empty(&pagelist))
621		err = migrate_pages(&pagelist, new_node_page, dest);
622
623	return err;
624}
625
626/*
627 * Move pages between the two nodesets so as to preserve the physical
628 * layout as much as possible.
629 *
630 * Returns the number of pages that could not be moved.
631 */
632int do_migrate_pages(struct mm_struct *mm,
633	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
634{
635	LIST_HEAD(pagelist);
636	int busy = 0;
637	int err = 0;
638	nodemask_t tmp;
639
640	down_read(&mm->mmap_sem);
641
642	err = migrate_vmas(mm, from_nodes, to_nodes, flags);
643	if (err)
644		goto out;
645
646/*
647 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
648 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
649 * bit in 'tmp', and return that <source, dest> pair for migration.
650 * The pair of nodemasks 'to' and 'from' define the map.
651 *
652 * If no pair of bits is found that way, fallback to picking some
653 * pair of 'source' and 'dest' bits that are not the same.  If the
654 * 'source' and 'dest' bits are the same, this represents a node
655 * that will be migrating to itself, so no pages need move.
656 *
657 * If no bits are left in 'tmp', or if all remaining bits left
658 * in 'tmp' correspond to the same bit in 'to', return false
659 * (nothing left to migrate).
660 *
661 * This lets us pick a pair of nodes to migrate between, such that
662 * if possible the dest node is not already occupied by some other
663 * source node, minimizing the risk of overloading the memory on a
664 * node that would happen if we migrated incoming memory to a node
665 * before migrating outgoing memory from that same node.
666 *
667 * A single scan of tmp is sufficient.  As we go, we remember the
668 * most recent <s, d> pair that moved (s != d).  If we find a pair
669 * that not only moved, but what's better, moved to an empty slot
670 * (d is not set in tmp), then we break out then, with that pair.
671 * Otherwise, when we finish scanning tmp, we at least have the
672 * most recent <s, d> pair that moved.  If we get all the way through
673 * the scan of tmp without finding any node that moved, much less
674 * moved to an empty node, then there is nothing left worth migrating.
675 */
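/*
 * Worked example (illustrative): from_nodes = {0,1}, to_nodes = {1,2}.
 * The first scan sees <0,1>, but node 1 is still a pending source, so it
 * keeps going and settles on <1,2> since node 2 is not a remaining source.
 * Node 1 is thus drained into node 2 before anything is moved onto it;
 * the next pass then moves node 0 into node 1.
 */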
676
677	tmp = *from_nodes;
678	while (!nodes_empty(tmp)) {
679		int s,d;
680		int source = -1;
681		int dest = 0;
682
683		for_each_node_mask(s, tmp) {
684			d = node_remap(s, *from_nodes, *to_nodes);
685			if (s == d)
686				continue;
687
688			source = s;	/* Node moved. Memorize */
689			dest = d;
690
691			/* dest not in remaining from nodes? */
692			if (!node_isset(dest, tmp))
693				break;
694		}
695		if (source == -1)
696			break;
697
698		node_clear(source, tmp);
699		err = migrate_to_node(mm, source, dest, flags);
700		if (err > 0)
701			busy += err;
702		if (err < 0)
703			break;
704	}
705out:
706	up_read(&mm->mmap_sem);
707	if (err < 0)
708		return err;
709	return busy;
710
711}
712
713static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
714{
715	struct vm_area_struct *vma = (struct vm_area_struct *)private;
716
717	return alloc_page_vma(GFP_HIGHUSER, vma, page_address_in_vma(page, vma));
718}
719#else
720
721static void migrate_page_add(struct page *page, struct list_head *pagelist,
722				unsigned long flags)
723{
724}
725
726int do_migrate_pages(struct mm_struct *mm,
727	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
728{
729	return -ENOSYS;
730}
731
732static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
733{
734	return NULL;
735}
736#endif
737
738long do_mbind(unsigned long start, unsigned long len,
739		unsigned long mode, nodemask_t *nmask, unsigned long flags)
740{
741	struct vm_area_struct *vma;
742	struct mm_struct *mm = current->mm;
743	struct mempolicy *new;
744	unsigned long end;
745	int err;
746	LIST_HEAD(pagelist);
747
748	if ((flags & ~(unsigned long)(MPOL_MF_STRICT |
749				      MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
750	    || mode > MPOL_MAX)
751		return -EINVAL;
752	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
753		return -EPERM;
754
755	if (start & ~PAGE_MASK)
756		return -EINVAL;
757
758	if (mode == MPOL_DEFAULT)
759		flags &= ~MPOL_MF_STRICT;
760
761	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
762	end = start + len;
763
764	if (end < start)
765		return -EINVAL;
766	if (end == start)
767		return 0;
768
769	if (mpol_check_policy(mode, nmask))
770		return -EINVAL;
771
772	new = mpol_new(mode, nmask);
773	if (IS_ERR(new))
774		return PTR_ERR(new);
775
776	/*
777	 * If we are using the default policy then operation
778	 * on discontinuous address spaces is okay after all
779	 */
780	if (!new)
781		flags |= MPOL_MF_DISCONTIG_OK;
782
783	PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n", start, start + len,
784			mode, nodes_addr(*nmask)[0]);
785
786	down_write(&mm->mmap_sem);
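	/* MPOL_MF_INVERT: check_range() examines (and, with MPOL_MF_MOVE*,
	   queues for migration) the pages that are NOT yet on the requested
	   nodes, i.e. the misplaced ones. */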
787	vma = check_range(mm, start, end, nmask,
788			  flags | MPOL_MF_INVERT, &pagelist);
789
790	err = PTR_ERR(vma);
791	if (!IS_ERR(vma)) {
792		int nr_failed = 0;
793
794		err = mbind_range(vma, start, end, new);
795
796		if (!list_empty(&pagelist))
797			nr_failed = migrate_pages(&pagelist, new_vma_page,
798						(unsigned long)vma);
799
800		if (!err && nr_failed && (flags & MPOL_MF_STRICT))
801			err = -EIO;
802	}
803
804	up_write(&mm->mmap_sem);
805	mpol_free(new);
806	return err;
807}
808
809/*
810 * User space interface with variable sized bitmaps for nodelists.
811 */
812
813/* Copy a node mask from user space. */
814static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
815		     unsigned long maxnode)
816{
817	unsigned long k;
818	unsigned long nlongs;
819	unsigned long endmask;
820
821	--maxnode;
822	nodes_clear(*nodes);
823	if (maxnode == 0 || !nmask)
824		return 0;
825	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
826		return -EINVAL;
827
828	nlongs = BITS_TO_LONGS(maxnode);
829	if ((maxnode % BITS_PER_LONG) == 0)
830		endmask = ~0UL;
831	else
832		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
833
834	/* When the user specified more nodes than supported, just check
835	   that the unsupported part is all zero. */
836	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
837		if (nlongs > PAGE_SIZE/sizeof(long))
838			return -EINVAL;
839		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
840			unsigned long t;
841			if (get_user(t, nmask + k))
842				return -EFAULT;
843			if (k == nlongs - 1) {
844				if (t & endmask)
845					return -EINVAL;
846			} else if (t)
847				return -EINVAL;
848		}
849		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
850		endmask = ~0UL;
851	}
852
853	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
854		return -EFAULT;
855	nodes_addr(*nodes)[nlongs-1] &= endmask;
856	return 0;
857}
858
859/* Copy a kernel node mask to user space */
860static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
861			      nodemask_t *nodes)
862{
863	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
864	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
865
866	if (copy > nbytes) {
867		if (copy > PAGE_SIZE)
868			return -EINVAL;
869		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
870			return -EFAULT;
871		copy = nbytes;
872	}
873	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
874}
875
876asmlinkage long sys_mbind(unsigned long start, unsigned long len,
877			unsigned long mode,
878			unsigned long __user *nmask, unsigned long maxnode,
879			unsigned flags)
880{
881	nodemask_t nodes;
882	int err;
883
884	err = get_nodes(&nodes, nmask, maxnode);
885	if (err)
886		return err;
887	return do_mbind(start, len, mode, &nodes, flags);
888}
889
890/* Set the process memory policy */
891asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
892		unsigned long maxnode)
893{
894	int err;
895	nodemask_t nodes;
896
897	if (mode < 0 || mode > MPOL_MAX)
898		return -EINVAL;
899	err = get_nodes(&nodes, nmask, maxnode);
900	if (err)
901		return err;
902	return do_set_mempolicy(mode, &nodes);
903}
904
905asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
906		const unsigned long __user *old_nodes,
907		const unsigned long __user *new_nodes)
908{
909	struct mm_struct *mm;
910	struct task_struct *task;
911	nodemask_t old;
912	nodemask_t new;
913	nodemask_t task_nodes;
914	int err;
915
916	err = get_nodes(&old, old_nodes, maxnode);
917	if (err)
918		return err;
919
920	err = get_nodes(&new, new_nodes, maxnode);
921	if (err)
922		return err;
923
924	/* Find the mm_struct */
925	read_lock(&tasklist_lock);
926	task = pid ? find_task_by_pid(pid) : current;
927	if (!task) {
928		read_unlock(&tasklist_lock);
929		return -ESRCH;
930	}
931	mm = get_task_mm(task);
932	read_unlock(&tasklist_lock);
933
934	if (!mm)
935		return -EINVAL;
936
937	/*
938	 * Check if this process has the right to modify the specified
939	 * process. The right exists if the process has administrative
940	 * capabilities, superuser privileges or the same
941	 * userid as the target process.
942	 */
943	if ((current->euid != task->suid) && (current->euid != task->uid) &&
944	    (current->uid != task->suid) && (current->uid != task->uid) &&
945	    !capable(CAP_SYS_NICE)) {
946		err = -EPERM;
947		goto out;
948	}
949
950	task_nodes = cpuset_mems_allowed(task);
951	/* Is the user allowed to access the target nodes? */
952	if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
953		err = -EPERM;
954		goto out;
955	}
956
957	err = security_task_movememory(task);
958	if (err)
959		goto out;
960
961	err = do_migrate_pages(mm, &old, &new,
962		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
963out:
964	mmput(mm);
965	return err;
966}
967
968
969/* Retrieve NUMA policy */
970asmlinkage long sys_get_mempolicy(int __user *policy,
971				unsigned long __user *nmask,
972				unsigned long maxnode,
973				unsigned long addr, unsigned long flags)
974{
975	int err, pval;
976	nodemask_t nodes;
977
978	if (nmask != NULL && maxnode < MAX_NUMNODES)
979		return -EINVAL;
980
981	err = do_get_mempolicy(&pval, &nodes, addr, flags);
982
983	if (err)
984		return err;
985
986	if (policy && put_user(pval, policy))
987		return -EFAULT;
988
989	if (nmask)
990		err = copy_nodes_to_user(nmask, maxnode, &nodes);
991
992	return err;
993}
994
995#ifdef CONFIG_COMPAT
996
997asmlinkage long compat_sys_get_mempolicy(int __user *policy,
998				     compat_ulong_t __user *nmask,
999				     compat_ulong_t maxnode,
1000				     compat_ulong_t addr, compat_ulong_t flags)
1001{
1002	long err;
1003	unsigned long __user *nm = NULL;
1004	unsigned long nr_bits, alloc_size;
1005	DECLARE_BITMAP(bm, MAX_NUMNODES);
1006
1007	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1008	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1009
1010	if (nmask)
1011		nm = compat_alloc_user_space(alloc_size);
1012
1013	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1014
1015	if (!err && nmask) {
1016		err = copy_from_user(bm, nm, alloc_size);
1017		/* ensure entire bitmap is zeroed */
1018		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1019		err |= compat_put_bitmap(nmask, bm, nr_bits);
1020	}
1021
1022	return err;
1023}
1024
1025asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1026				     compat_ulong_t maxnode)
1027{
1028	long err = 0;
1029	unsigned long __user *nm = NULL;
1030	unsigned long nr_bits, alloc_size;
1031	DECLARE_BITMAP(bm, MAX_NUMNODES);
1032
1033	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1034	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1035
1036	if (nmask) {
1037		err = compat_get_bitmap(bm, nmask, nr_bits);
1038		nm = compat_alloc_user_space(alloc_size);
1039		err |= copy_to_user(nm, bm, alloc_size);
1040	}
1041
1042	if (err)
1043		return -EFAULT;
1044
1045	return sys_set_mempolicy(mode, nm, nr_bits+1);
1046}
1047
1048asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1049			     compat_ulong_t mode, compat_ulong_t __user *nmask,
1050			     compat_ulong_t maxnode, compat_ulong_t flags)
1051{
1052	long err = 0;
1053	unsigned long __user *nm = NULL;
1054	unsigned long nr_bits, alloc_size;
1055	nodemask_t bm;
1056
1057	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1058	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1059
1060	if (nmask) {
1061		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1062		nm = compat_alloc_user_space(alloc_size);
1063		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1064	}
1065
1066	if (err)
1067		return -EFAULT;
1068
1069	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1070}
1071
1072#endif
1073
1074/* Return effective policy for a VMA */
1075static struct mempolicy * get_vma_policy(struct task_struct *task,
1076		struct vm_area_struct *vma, unsigned long addr)
1077{
1078	struct mempolicy *pol = task->mempolicy;
1079
1080	if (vma) {
1081		if (vma->vm_ops && vma->vm_ops->get_policy)
1082			pol = vma->vm_ops->get_policy(vma, addr);
1083		else if (vma->vm_policy &&
1084				vma->vm_policy->policy != MPOL_DEFAULT)
1085			pol = vma->vm_policy;
1086	}
1087	if (!pol)
1088		pol = &default_policy;
1089	return pol;
1090}
1091
1092/* Return a zonelist representing a mempolicy */
1093static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
1094{
1095	int nd;
1096
1097	switch (policy->policy) {
1098	case MPOL_PREFERRED:
1099		nd = policy->v.preferred_node;
1100		if (nd < 0)
1101			nd = numa_node_id();
1102		break;
1103	case MPOL_BIND:
1104		/* Lower zones don't get a policy applied */
1105		/* Careful: current->mems_allowed might have moved */
1106		if (gfp_zone(gfp) >= policy_zone)
1107			if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
1108				return policy->v.zonelist;
1109		/*FALL THROUGH*/
1110	case MPOL_INTERLEAVE: /* should not happen */
1111	case MPOL_DEFAULT:
1112		nd = numa_node_id();
1113		break;
1114	default:
1115		nd = 0;
1116		BUG();
1117	}
1118	return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
1119}
1120
1121/* Do dynamic interleaving for a process */
1122static unsigned interleave_nodes(struct mempolicy *policy)
1123{
1124	unsigned nid, next;
1125	struct task_struct *me = current;
1126
1127	nid = me->il_next;
1128	next = next_node(nid, policy->v.nodes);
1129	if (next >= MAX_NUMNODES)
1130		next = first_node(policy->v.nodes);
1131	me->il_next = next;
1132	return nid;
1133}
1134
1135/*
1136 * Depending on the memory policy provide a node from which to allocate the
1137 * next slab entry.
1138 */
1139unsigned slab_node(struct mempolicy *policy)
1140{
1141	int pol = policy ? policy->policy : MPOL_DEFAULT;
1142
1143	switch (pol) {
1144	case MPOL_INTERLEAVE:
1145		return interleave_nodes(policy);
1146
1147	case MPOL_BIND:
1148		/*
1149		 * Follow bind policy behavior and start allocation at the
1150		 * first node.
1151		 */
1152		return zone_to_nid(policy->v.zonelist->zones[0]);
1153
1154	case MPOL_PREFERRED:
1155		if (policy->v.preferred_node >= 0)
1156			return policy->v.preferred_node;
1157		/* Fall through */
1158
1159	default:
1160		return numa_node_id();
1161	}
1162}
1163
1164/* Do static interleaving for a VMA with known offset. */
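/* The node used is the (off % weight(pol->v.nodes))-th set node, counting from zero. */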
1165static unsigned offset_il_node(struct mempolicy *pol,
1166		struct vm_area_struct *vma, unsigned long off)
1167{
1168	unsigned nnodes = nodes_weight(pol->v.nodes);
1169	unsigned target = (unsigned)off % nnodes;
1170	int c;
1171	int nid = -1;
1172
1173	c = 0;
1174	do {
1175		nid = next_node(nid, pol->v.nodes);
1176		c++;
1177	} while (c <= target);
1178	return nid;
1179}
1180
1181/* Determine a node number for interleave */
1182static inline unsigned interleave_nid(struct mempolicy *pol,
1183		 struct vm_area_struct *vma, unsigned long addr, int shift)
1184{
1185	if (vma) {
1186		unsigned long off;
1187
1188		/*
1189		 * for small pages, there is no difference between
1190		 * shift and PAGE_SHIFT, so the bit-shift is safe.
1191		 * for huge pages, since vm_pgoff is in units of small
1192		 * pages, we need to shift off the always 0 bits to get
1193		 * a useful offset.
1194		 */
1195		BUG_ON(shift < PAGE_SHIFT);
1196		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1197		off += (addr - vma->vm_start) >> shift;
1198		return offset_il_node(pol, vma, off);
1199	} else
1200		return interleave_nodes(pol);
1201}
1202
1203#ifdef CONFIG_HUGETLBFS
1204/* Return a zonelist suitable for a huge page allocation. */
1205struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
1206{
1207	struct mempolicy *pol = get_vma_policy(current, vma, addr);
1208
1209	if (pol->policy == MPOL_INTERLEAVE) {
1210		unsigned nid;
1211
1212		nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
1213		return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER);
1214	}
1215	return zonelist_policy(GFP_HIGHUSER, pol);
1216}
1217#endif
1218
1219/* Allocate a page in interleaved policy.
1220   Own path because it needs to do special accounting. */
1221static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1222					unsigned nid)
1223{
1224	struct zonelist *zl;
1225	struct page *page;
1226
1227	zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
1228	page = __alloc_pages(gfp, order, zl);
1229	if (page && page_zone(page) == zl->zones[0])
1230		inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1231	return page;
1232}
1233
1234/**
1235 * 	alloc_page_vma	- Allocate a page for a VMA.
1236 *
1237 * 	@gfp:
1238 *      %GFP_USER    user allocation.
1239 *      %GFP_KERNEL  kernel allocations,
1240 *      %GFP_HIGHMEM highmem/user allocations,
1241 *      %GFP_FS      allocation should not call back into a file system.
1242 *      %GFP_ATOMIC  don't sleep.
1243 *
1244 * 	@vma:  Pointer to VMA or NULL if not available.
1245 *	@addr: Virtual Address of the allocation. Must be inside the VMA.
1246 *
1247 * 	This function allocates a page from the kernel page pool and applies
1248 *	a NUMA policy associated with the VMA or the current process.
1249 *	When VMA is not NULL caller must hold down_read on the mmap_sem of the
1250 *	mm_struct of the VMA to prevent it from going away. Should be used for
1251 *	all allocations for pages that will be mapped into
1252 * 	user space. Returns NULL when no page can be allocated.
1253 *
1254 *	Should be called with the mmap_sem of the vma's mm held.
1255 */
1256struct page *
1257alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1258{
1259	struct mempolicy *pol = get_vma_policy(current, vma, addr);
1260
1261	cpuset_update_task_memory_state();
1262
1263	if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
1264		unsigned nid;
1265
1266		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1267		return alloc_page_interleave(gfp, 0, nid);
1268	}
1269	return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
1270}
1271
1272/**
1273 * 	alloc_pages_current - Allocate pages.
1274 *
1275 *	@gfp:
1276 *		%GFP_USER   user allocation,
1277 *      	%GFP_KERNEL kernel allocation,
1278 *      	%GFP_HIGHMEM highmem allocation,
1279 *      	%GFP_FS     don't call back into a file system.
1280 *      	%GFP_ATOMIC don't sleep.
1281 *	@order: Power of two of allocation size in pages. 0 is a single page.
1282 *
1283 *	Allocate a page from the kernel page pool and, when not in
1284 *	interrupt context, apply the current process' NUMA policy.
1285 *	Returns NULL when no page can be allocated.
1286 *
1287 *	Don't call cpuset_update_task_memory_state() unless
1288 *	1) it's ok to take cpuset_sem (can WAIT), and
1289 *	2) allocating for current task (not interrupt).
1290 */
1291struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1292{
1293	struct mempolicy *pol = current->mempolicy;
1294
1295	if ((gfp & __GFP_WAIT) && !in_interrupt())
1296		cpuset_update_task_memory_state();
1297	if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1298		pol = &default_policy;
1299	if (pol->policy == MPOL_INTERLEAVE)
1300		return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1301	return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
1302}
1303EXPORT_SYMBOL(alloc_pages_current);
1304
1305/*
1306 * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it
1307 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
1308 * with the mems_allowed returned by cpuset_mems_allowed().  This
1309 * keeps mempolicies cpuset-relative after the task's cpuset moves.  See
1310 * further kernel/cpuset.c update_nodemask().
1311 */
1312void *cpuset_being_rebound;
1313
1314/* Slow path of a mempolicy copy */
1315struct mempolicy *__mpol_copy(struct mempolicy *old)
1316{
1317	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1318
1319	if (!new)
1320		return ERR_PTR(-ENOMEM);
1321	if (current_cpuset_is_being_rebound()) {
1322		nodemask_t mems = cpuset_mems_allowed(current);
1323		mpol_rebind_policy(old, &mems);
1324	}
1325	*new = *old;
1326	atomic_set(&new->refcnt, 1);
1327	if (new->policy == MPOL_BIND) {
1328		int sz = ksize(old->v.zonelist);
1329		new->v.zonelist = kmemdup(old->v.zonelist, sz, GFP_KERNEL);
1330		if (!new->v.zonelist) {
1331			kmem_cache_free(policy_cache, new);
1332			return ERR_PTR(-ENOMEM);
1333		}
1334	}
1335	return new;
1336}
1337
1338/* Slow path of a mempolicy comparison */
1339int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1340{
1341	if (!a || !b)
1342		return 0;
1343	if (a->policy != b->policy)
1344		return 0;
1345	switch (a->policy) {
1346	case MPOL_DEFAULT:
1347		return 1;
1348	case MPOL_INTERLEAVE:
1349		return nodes_equal(a->v.nodes, b->v.nodes);
1350	case MPOL_PREFERRED:
1351		return a->v.preferred_node == b->v.preferred_node;
1352	case MPOL_BIND: {
1353		int i;
1354		for (i = 0; a->v.zonelist->zones[i]; i++)
1355			if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
1356				return 0;
1357		return b->v.zonelist->zones[i] == NULL;
1358	}
1359	default:
1360		BUG();
1361		return 0;
1362	}
1363}
1364
1365/* Slow path of a mpol destructor. */
1366void __mpol_free(struct mempolicy *p)
1367{
1368	if (!atomic_dec_and_test(&p->refcnt))
1369		return;
1370	if (p->policy == MPOL_BIND)
1371		kfree(p->v.zonelist);
1372	p->policy = MPOL_DEFAULT;
1373	kmem_cache_free(policy_cache, p);
1374}
1375
1376/*
1377 * Shared memory backing store policy support.
1378 *
1379 * Remember policies even when nobody has shared memory mapped.
1380 * The policies are kept in Red-Black tree linked from the inode.
1381 * They are protected by the sp->lock spinlock, which should be held
1382 * for any accesses to the tree.
1383 */
1384
1385/* lookup first element intersecting start-end */
1386/* Caller holds sp->lock */
1387static struct sp_node *
1388sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1389{
1390	struct rb_node *n = sp->root.rb_node;
1391
1392	while (n) {
1393		struct sp_node *p = rb_entry(n, struct sp_node, nd);
1394
1395		if (start >= p->end)
1396			n = n->rb_right;
1397		else if (end <= p->start)
1398			n = n->rb_left;
1399		else
1400			break;
1401	}
1402	if (!n)
1403		return NULL;
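	/* Walk backwards to the first (lowest-starting) node that still
	   overlaps the range. */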
1404	for (;;) {
1405		struct sp_node *w = NULL;
1406		struct rb_node *prev = rb_prev(n);
1407		if (!prev)
1408			break;
1409		w = rb_entry(prev, struct sp_node, nd);
1410		if (w->end <= start)
1411			break;
1412		n = prev;
1413	}
1414	return rb_entry(n, struct sp_node, nd);
1415}
1416
1417/* Insert a new shared policy into the list. */
1418/* Caller holds sp->lock */
1419static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1420{
1421	struct rb_node **p = &sp->root.rb_node;
1422	struct rb_node *parent = NULL;
1423	struct sp_node *nd;
1424
1425	while (*p) {
1426		parent = *p;
1427		nd = rb_entry(parent, struct sp_node, nd);
1428		if (new->start < nd->start)
1429			p = &(*p)->rb_left;
1430		else if (new->end > nd->end)
1431			p = &(*p)->rb_right;
1432		else
1433			BUG();
1434	}
1435	rb_link_node(&new->nd, parent, p);
1436	rb_insert_color(&new->nd, &sp->root);
1437	PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
1438		 new->policy ? new->policy->policy : 0);
1439}
1440
1441/* Find shared policy intersecting idx */
1442struct mempolicy *
1443mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1444{
1445	struct mempolicy *pol = NULL;
1446	struct sp_node *sn;
1447
1448	if (!sp->root.rb_node)
1449		return NULL;
1450	spin_lock(&sp->lock);
1451	sn = sp_lookup(sp, idx, idx+1);
1452	if (sn) {
1453		mpol_get(sn->policy);
1454		pol = sn->policy;
1455	}
1456	spin_unlock(&sp->lock);
1457	return pol;
1458}
1459
1460static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1461{
1462	PDprintk("deleting %lx-%lx\n", n->start, n->end);
1463	rb_erase(&n->nd, &sp->root);
1464	mpol_free(n->policy);
1465	kmem_cache_free(sn_cache, n);
1466}
1467
1468struct sp_node *
1469sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
1470{
1471	struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1472
1473	if (!n)
1474		return NULL;
1475	n->start = start;
1476	n->end = end;
1477	mpol_get(pol);
1478	n->policy = pol;
1479	return n;
1480}
1481
1482/* Replace a policy range. */
1483static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1484				 unsigned long end, struct sp_node *new)
1485{
1486	struct sp_node *n, *new2 = NULL;
1487
1488restart:
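/* new2 is preallocated with sp->lock dropped when an existing node must be
   split in two around [start, end); sp_alloc() may sleep, hence the restart. */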
1489	spin_lock(&sp->lock);
1490	n = sp_lookup(sp, start, end);
1491	/* Take care of old policies in the same range. */
1492	while (n && n->start < end) {
1493		struct rb_node *next = rb_next(&n->nd);
1494		if (n->start >= start) {
1495			if (n->end <= end)
1496				sp_delete(sp, n);
1497			else
1498				n->start = end;
1499		} else {
1500			/* Old policy spanning whole new range. */
1501			if (n->end > end) {
1502				if (!new2) {
1503					spin_unlock(&sp->lock);
1504					new2 = sp_alloc(end, n->end, n->policy);
1505					if (!new2)
1506						return -ENOMEM;
1507					goto restart;
1508				}
1509				n->end = start;
1510				sp_insert(sp, new2);
1511				new2 = NULL;
1512				break;
1513			} else
1514				n->end = start;
1515		}
1516		if (!next)
1517			break;
1518		n = rb_entry(next, struct sp_node, nd);
1519	}
1520	if (new)
1521		sp_insert(sp, new);
1522	spin_unlock(&sp->lock);
1523	if (new2) {
1524		mpol_free(new2->policy);
1525		kmem_cache_free(sn_cache, new2);
1526	}
1527	return 0;
1528}
1529
1530void mpol_shared_policy_init(struct shared_policy *info, int policy,
1531				nodemask_t *policy_nodes)
1532{
1533	info->root = RB_ROOT;
1534	spin_lock_init(&info->lock);
1535
1536	if (policy != MPOL_DEFAULT) {
1537		struct mempolicy *newpol;
1538
1539		/* Falls back to MPOL_DEFAULT on any error */
1540		newpol = mpol_new(policy, policy_nodes);
1541		if (!IS_ERR(newpol)) {
1542			/* Create pseudo-vma that contains just the policy */
1543			struct vm_area_struct pvma;
1544
1545			memset(&pvma, 0, sizeof(struct vm_area_struct));
1546			/* Policy covers entire file */
1547			pvma.vm_end = TASK_SIZE;
1548			mpol_set_shared_policy(info, &pvma, newpol);
1549			mpol_free(newpol);
1550		}
1551	}
1552}
1553
1554int mpol_set_shared_policy(struct shared_policy *info,
1555			struct vm_area_struct *vma, struct mempolicy *npol)
1556{
1557	int err;
1558	struct sp_node *new = NULL;
1559	unsigned long sz = vma_pages(vma);
1560
1561	PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
1562		 vma->vm_pgoff,
1563		 sz, npol? npol->policy : -1,
1564		npol ? nodes_addr(npol->v.nodes)[0] : -1);
1565
1566	if (npol) {
1567		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1568		if (!new)
1569			return -ENOMEM;
1570	}
1571	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1572	if (err && new)
1573		kmem_cache_free(sn_cache, new);
1574	return err;
1575}
1576
1577/* Free a backing policy store on inode delete. */
1578void mpol_free_shared_policy(struct shared_policy *p)
1579{
1580	struct sp_node *n;
1581	struct rb_node *next;
1582
1583	if (!p->root.rb_node)
1584		return;
1585	spin_lock(&p->lock);
1586	next = rb_first(&p->root);
1587	while (next) {
1588		n = rb_entry(next, struct sp_node, nd);
1589		next = rb_next(&n->nd);
1590		rb_erase(&n->nd, &p->root);
1591		mpol_free(n->policy);
1592		kmem_cache_free(sn_cache, n);
1593	}
1594	spin_unlock(&p->lock);
1595}
1596
1597/* assumes fs == KERNEL_DS */
1598void __init numa_policy_init(void)
1599{
1600	policy_cache = kmem_cache_create("numa_policy",
1601					 sizeof(struct mempolicy),
1602					 0, SLAB_PANIC, NULL, NULL);
1603
1604	sn_cache = kmem_cache_create("shared_policy_node",
1605				     sizeof(struct sp_node),
1606				     0, SLAB_PANIC, NULL, NULL);
1607
1608	/* Set interleaving policy for system init. This way not all
1609	   the data structures allocated at system boot end up in node zero. */
1610
1611	if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
1612		printk("numa_policy_init: interleaving failed\n");
1613}
1614
1615/* Reset policy of current process to default */
1616void numa_default_policy(void)
1617{
1618	do_set_mempolicy(MPOL_DEFAULT, NULL);
1619}
1620
1621/* Migrate a policy to a different set of nodes */
1622void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
1623{
1624	nodemask_t *mpolmask;
1625	nodemask_t tmp;
1626
1627	if (!pol)
1628		return;
1629	mpolmask = &pol->cpuset_mems_allowed;
1630	if (nodes_equal(*mpolmask, *newmask))
1631		return;
1632
1633	switch (pol->policy) {
1634	case MPOL_DEFAULT:
1635		break;
1636	case MPOL_INTERLEAVE:
1637		nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
1638		pol->v.nodes = tmp;
1639		*mpolmask = *newmask;
1640		current->il_next = node_remap(current->il_next,
1641						*mpolmask, *newmask);
1642		break;
1643	case MPOL_PREFERRED:
1644		pol->v.preferred_node = node_remap(pol->v.preferred_node,
1645						*mpolmask, *newmask);
1646		*mpolmask = *newmask;
1647		break;
1648	case MPOL_BIND: {
1649		nodemask_t nodes;
1650		struct zone **z;
1651		struct zonelist *zonelist;
1652
1653		nodes_clear(nodes);
1654		for (z = pol->v.zonelist->zones; *z; z++)
1655			node_set(zone_to_nid(*z), nodes);
1656		nodes_remap(tmp, nodes, *mpolmask, *newmask);
1657		nodes = tmp;
1658
1659		zonelist = bind_zonelist(&nodes);
1660
1661		/* If no mem, then zonelist is NULL and we keep old zonelist.
1662		 * If that old zonelist has no remaining mems_allowed nodes,
1663		 * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
1664		 */
1665
1666		if (zonelist) {
1667			/* Good - got mem - substitute new zonelist */
1668			kfree(pol->v.zonelist);
1669			pol->v.zonelist = zonelist;
1670		}
1671		*mpolmask = *newmask;
1672		break;
1673	}
1674	default:
1675		BUG();
1676		break;
1677	}
1678}
1679
1680/*
1681 * Wrapper for mpol_rebind_policy() that just requires task
1682 * pointer, and updates task mempolicy.
1683 */
1684
1685void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
1686{
1687	mpol_rebind_policy(tsk->mempolicy, new);
1688}
1689
1690/*
1691 * Rebind each vma in mm to new nodemask.
1692 *
1693 * Call holding a reference to mm.  Takes mm->mmap_sem during call.
1694 */
1695
1696void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
1697{
1698	struct vm_area_struct *vma;
1699
1700	down_write(&mm->mmap_sem);
1701	for (vma = mm->mmap; vma; vma = vma->vm_next)
1702		mpol_rebind_policy(vma->vm_policy, new);
1703	up_write(&mm->mmap_sem);
1704}
1705
1706/*
1707 * Display pages allocated per node and memory policy via /proc.
1708 */
1709
1710static const char * const policy_types[] =
1711	{ "default", "prefer", "bind", "interleave" };
1712
1713/*
1714 * Convert a mempolicy into a string.
1715 * Returns the number of characters in buffer (if positive)
1716 * or an error (negative)
1717 */
1718static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
1719{
1720	char *p = buffer;
1721	int l;
1722	nodemask_t nodes;
1723	int mode = pol ? pol->policy : MPOL_DEFAULT;
1724
1725	switch (mode) {
1726	case MPOL_DEFAULT:
1727		nodes_clear(nodes);
1728		break;
1729
1730	case MPOL_PREFERRED:
1731		nodes_clear(nodes);
1732		node_set(pol->v.preferred_node, nodes);
1733		break;
1734
1735	case MPOL_BIND:
1736		get_zonemask(pol, &nodes);
1737		break;
1738
1739	case MPOL_INTERLEAVE:
1740		nodes = pol->v.nodes;
1741		break;
1742
1743	default:
1744		BUG();
1745		return -EFAULT;
1746	}
1747
1748	l = strlen(policy_types[mode]);
1749 	if (buffer + maxlen < p + l + 1)
1750 		return -ENOSPC;
1751
1752	strcpy(p, policy_types[mode]);
1753	p += l;
1754
1755	if (!nodes_empty(nodes)) {
1756		if (buffer + maxlen < p + 2)
1757			return -ENOSPC;
1758		*p++ = '=';
1759	 	p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
1760	}
1761	return p - buffer;
1762}
1763
1764struct numa_maps {
1765	unsigned long pages;
1766	unsigned long anon;
1767	unsigned long active;
1768	unsigned long writeback;
1769	unsigned long mapcount_max;
1770	unsigned long dirty;
1771	unsigned long swapcache;
1772	unsigned long node[MAX_NUMNODES];
1773};
1774
1775static void gather_stats(struct page *page, void *private, int pte_dirty)
1776{
1777	struct numa_maps *md = private;
1778	int count = page_mapcount(page);
1779
1780	md->pages++;
1781	if (pte_dirty || PageDirty(page))
1782		md->dirty++;
1783
1784	if (PageSwapCache(page))
1785		md->swapcache++;
1786
1787	if (PageActive(page))
1788		md->active++;
1789
1790	if (PageWriteback(page))
1791		md->writeback++;
1792
1793	if (PageAnon(page))
1794		md->anon++;
1795
1796	if (count > md->mapcount_max)
1797		md->mapcount_max = count;
1798
1799	md->node[page_to_nid(page)]++;
1800}
1801
1802#ifdef CONFIG_HUGETLB_PAGE
1803static void check_huge_range(struct vm_area_struct *vma,
1804		unsigned long start, unsigned long end,
1805		struct numa_maps *md)
1806{
1807	unsigned long addr;
1808	struct page *page;
1809
1810	for (addr = start; addr < end; addr += HPAGE_SIZE) {
1811		pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK);
1812		pte_t pte;
1813
1814		if (!ptep)
1815			continue;
1816
1817		pte = *ptep;
1818		if (pte_none(pte))
1819			continue;
1820
1821		page = pte_page(pte);
1822		if (!page)
1823			continue;
1824
1825		gather_stats(page, md, pte_dirty(*ptep));
1826	}
1827}
1828#else
1829static inline void check_huge_range(struct vm_area_struct *vma,
1830		unsigned long start, unsigned long end,
1831		struct numa_maps *md)
1832{
1833}
1834#endif
1835
1836int show_numa_map(struct seq_file *m, void *v)
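/*
 * Emits one line per vma to /proc/<pid>/numa_maps.  The format follows
 * directly from the seq_printf() calls below:
 *   <start> <policy>[=nodelist] [file=<path>|heap|stack] [huge]
 *   [anon=N] [dirty=N] [mapped=N] [mapmax=N] [swapcache=N]
 *   [active=N] [writeback=N] N<node>=<pages> ...
 */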
1837{
1838	struct proc_maps_private *priv = m->private;
1839	struct vm_area_struct *vma = v;
1840	struct numa_maps *md;
1841	struct file *file = vma->vm_file;
1842	struct mm_struct *mm = vma->vm_mm;
1843	int n;
1844	char buffer[50];
1845
1846	if (!mm)
1847		return 0;
1848
1849	md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
1850	if (!md)
1851		return 0;
1852
1853	mpol_to_str(buffer, sizeof(buffer),
1854			    get_vma_policy(priv->task, vma, vma->vm_start));
1855
1856	seq_printf(m, "%08lx %s", vma->vm_start, buffer);
1857
1858	if (file) {
1859		seq_printf(m, " file=");
1860		seq_path(m, file->f_vfsmnt, file->f_dentry, "\n\t= ");
1861	} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
1862		seq_printf(m, " heap");
1863	} else if (vma->vm_start <= mm->start_stack &&
1864			vma->vm_end >= mm->start_stack) {
1865		seq_printf(m, " stack");
1866	}
1867
1868	if (is_vm_hugetlb_page(vma)) {
1869		check_huge_range(vma, vma->vm_start, vma->vm_end, md);
1870		seq_printf(m, " huge");
1871	} else {
1872		check_pgd_range(vma, vma->vm_start, vma->vm_end,
1873				&node_online_map, MPOL_MF_STATS, md);
1874	}
1875
1876	if (!md->pages)
1877		goto out;
1878
1879	if (md->anon)
1880		seq_printf(m," anon=%lu",md->anon);
1881
1882	if (md->dirty)
1883		seq_printf(m," dirty=%lu",md->dirty);
1884
1885	if (md->pages != md->anon && md->pages != md->dirty)
1886		seq_printf(m, " mapped=%lu", md->pages);
1887
1888	if (md->mapcount_max > 1)
1889		seq_printf(m, " mapmax=%lu", md->mapcount_max);
1890
1891	if (md->swapcache)
1892		seq_printf(m," swapcache=%lu", md->swapcache);
1893
1894	if (md->active < md->pages && !is_vm_hugetlb_page(vma))
1895		seq_printf(m," active=%lu", md->active);
1896
1897	if (md->writeback)
1898		seq_printf(m," writeback=%lu", md->writeback);
1899
1900	for_each_online_node(n)
1901		if (md->node[n])
1902			seq_printf(m, " N%d=%lu", n, md->node[n]);
1903out:
1904	seq_putc(m, '\n');
1905	kfree(md);
1906
1907	if (m->count < m->size)
1908		m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
1909	return 0;
1910}
1911
1912