mempolicy.c revision 19770b32609b6bf97a3dece2529089494cbfc549
1/*
2 * Simple NUMA memory policy for the Linux kernel.
3 *
4 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
6 * Subject to the GNU Public License, version 2.
7 *
8 * NUMA policy allows the user to give hints in which node(s) memory should
9 * be allocated.
10 *
11 * Support four policies per VMA and per process:
12 *
13 * The VMA policy has priority over the process policy for a page fault.
14 *
15 * interleave     Allocate memory interleaved over a set of nodes,
16 *                with normal fallback if it fails.
17 *                For VMA based allocations this interleaves based on the
18 *                offset into the backing object or offset into the mapping
19 *                for anonymous memory. For process policy a per-process
20 *                counter is used.
21 *
22 * bind           Only allocate memory on a specific set of nodes,
23 *                no fallback.
24 *                FIXME: memory is allocated starting with the first node
25 *                to the last. It would be better if bind would truly restrict
26 *                the allocation to the specified memory nodes instead.
27 *
28 * preferred       Try a specific node first before normal fallback.
29 *                As a special case node -1 here means do the allocation
30 *                on the local CPU. This is normally identical to default,
31 *                but useful to set in a VMA when you have a non default
32 *                process policy.
33 *
34 * default        Allocate on the local node first, or when on a VMA
35 *                use the process policy. This is what Linux always did
36 *		  in a NUMA aware kernel and still does by, ahem, default.
37 *
38 * The process policy is applied for most non-interrupt memory allocations
39 * in that process' context. Interrupts ignore the policies and always
40 * try to allocate on the local CPU. The VMA policy is only applied for memory
41 * allocations for a VMA in the VM.
42 *
43 * Currently there are a few corner cases in swapping where the policy
44 * is not applied, but the majority should be handled. When process policy
45 * is used it is not remembered over swap outs/swap ins.
46 *
47 * Only the highest zone in the zone hierarchy gets policied. Allocations
48 * requesting a lower zone just use default policy. This implies that
49 * on systems with highmem, kernel lowmem allocations don't get policied.
50 * Same with GFP_DMA allocations.
51 *
52 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
53 * all users and remembered even when nobody has memory mapped.
54 */
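/*
 * Illustrative userspace sketch (not part of this file): requesting the
 * policies described above through the set_mempolicy(2)/mbind(2) wrappers.
 * Assumes libnuma's <numaif.h> declarations; node masks and error handling
 * are simplified.
 *
 *	#include <numaif.h>
 *	#include <sys/mman.h>
 *
 *	void numa_policy_example(void)
 *	{
 *		unsigned long interleave_mask = 0x3;	(nodes 0 and 1)
 *		unsigned long bind_mask = 0x2;		(node 1 only)
 *		void *buf = mmap(NULL, 1 << 20, PROT_READ | PROT_WRITE,
 *				 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *
 *		(process policy: interleave new allocations over nodes 0-1)
 *		set_mempolicy(MPOL_INTERLEAVE, &interleave_mask,
 *			      8 * sizeof(interleave_mask));
 *
 *		(VMA policy: bind the range to node 1, migrating what is there)
 *		mbind(buf, 1 << 20, MPOL_BIND, &bind_mask,
 *		      8 * sizeof(bind_mask), MPOL_MF_MOVE);
 *
 *		(back to default, i.e. local, allocation for the process)
 *		set_mempolicy(MPOL_DEFAULT, NULL, 0);
 *	}
 */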
55
56/* Notebook:
57   fix mmap readahead to honour policy and enable policy for any page cache
58   object
59   statistics for bigpages
60   global policy for page cache? currently it uses process policy. Requires
61   first item above.
62   handle mremap for shared memory (currently ignored for the policy)
63   grows down?
64   make bind policy root only? It can trigger oom much faster and the
65   kernel is not always graceful about that.
66   could replace all the switch()es with a mempolicy_ops structure.
67*/
68
69#include <linux/mempolicy.h>
70#include <linux/mm.h>
71#include <linux/highmem.h>
72#include <linux/hugetlb.h>
73#include <linux/kernel.h>
74#include <linux/sched.h>
75#include <linux/nodemask.h>
76#include <linux/cpuset.h>
77#include <linux/gfp.h>
78#include <linux/slab.h>
79#include <linux/string.h>
80#include <linux/module.h>
81#include <linux/nsproxy.h>
82#include <linux/interrupt.h>
83#include <linux/init.h>
84#include <linux/compat.h>
85#include <linux/swap.h>
86#include <linux/seq_file.h>
87#include <linux/proc_fs.h>
88#include <linux/migrate.h>
89#include <linux/rmap.h>
90#include <linux/security.h>
91#include <linux/syscalls.h>
92
93#include <asm/tlbflush.h>
94#include <asm/uaccess.h>
95
96/* Internal flags */
97#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for contiguous vmas */
98#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
99#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)		/* Gather statistics */
100
101static struct kmem_cache *policy_cache;
102static struct kmem_cache *sn_cache;
103
104/* Highest zone. A specific allocation for a zone below that is not
105   policied. */
106enum zone_type policy_zone = 0;
107
108struct mempolicy default_policy = {
109	.refcnt = ATOMIC_INIT(1), /* never free it */
110	.policy = MPOL_DEFAULT,
111};
112
113static void mpol_rebind_policy(struct mempolicy *pol,
114                               const nodemask_t *newmask);
115
116/* Do sanity checking on a policy */
117static int mpol_check_policy(int mode, nodemask_t *nodes)
118{
119	int was_empty, is_empty;
120
121	if (!nodes)
122		return 0;
123
124	/*
125	 * "Contextualize" the incoming nodemask for cpusets:
126	 * remember whether the incoming nodemask was empty.  If not,
127	 * restrict the nodes to the allowed nodes in the cpuset.
128	 * This is guaranteed to be a subset of nodes with memory.
129	 */
130	cpuset_update_task_memory_state();
131	is_empty = was_empty = nodes_empty(*nodes);
132	if (!was_empty) {
133		nodes_and(*nodes, *nodes, cpuset_current_mems_allowed);
134		is_empty = nodes_empty(*nodes);	/* after "contextualization" */
135	}
136
137	switch (mode) {
138	case MPOL_DEFAULT:
139		/*
140		 * require caller to specify an empty nodemask
141		 * before "contextualization"
142		 */
143		if (!was_empty)
144			return -EINVAL;
145		break;
146	case MPOL_BIND:
147	case MPOL_INTERLEAVE:
148		/*
149		 * require at least 1 valid node after "contextualization"
150		 */
151		if (is_empty)
152			return -EINVAL;
153		break;
154	case MPOL_PREFERRED:
155		/*
156		 * Did caller specify invalid nodes?
157		 * Don't silently accept this as "local allocation".
158		 */
159		if (!was_empty && is_empty)
160			return -EINVAL;
161		break;
162	}
163	return 0;
164}
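/*
 * Worked example of the "contextualization" above: a task in a cpuset
 * whose mems_allowed is {2,3} asking for MPOL_BIND over nodes {0-7} ends
 * up with an effective nodemask of {2,3}.  If it asked for {0,1} instead,
 * the intersection is empty and mpol_check_policy() returns -EINVAL.
 */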
165
166/* Check that the nodemask contains at least one populated zone */
167static int is_valid_nodemask(nodemask_t *nodemask)
168{
169	int nd, k;
170
171	/* Check that there is something useful in this mask */
172	k = policy_zone;
173
174	for_each_node_mask(nd, *nodemask) {
175		struct zone *z;
176
177		for (k = 0; k <= policy_zone; k++) {
178			z = &NODE_DATA(nd)->node_zones[k];
179			if (z->present_pages > 0)
180				return 1;
181		}
182	}
183
184	return 0;
185}
186
187/* Create a new policy */
188static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
189{
190	struct mempolicy *policy;
191
192	pr_debug("setting mode %d nodes[0] %lx\n",
193		 mode, nodes ? nodes_addr(*nodes)[0] : -1);
194
195	if (mode == MPOL_DEFAULT)
196		return NULL;
197	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
198	if (!policy)
199		return ERR_PTR(-ENOMEM);
200	atomic_set(&policy->refcnt, 1);
201	switch (mode) {
202	case MPOL_INTERLEAVE:
203		policy->v.nodes = *nodes;
204		if (nodes_weight(policy->v.nodes) == 0) {
205			kmem_cache_free(policy_cache, policy);
206			return ERR_PTR(-EINVAL);
207		}
208		break;
209	case MPOL_PREFERRED:
210		policy->v.preferred_node = first_node(*nodes);
211		if (policy->v.preferred_node >= MAX_NUMNODES)
212			policy->v.preferred_node = -1;
213		break;
214	case MPOL_BIND:
215		if (!is_valid_nodemask(nodes)) {
216			kmem_cache_free(policy_cache, policy);
217			return ERR_PTR(-EINVAL);
218		}
219		policy->v.nodes = *nodes;
220		break;
221	}
222	policy->policy = mode;
223	policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
224	return policy;
225}
226
227static void gather_stats(struct page *, void *, int pte_dirty);
228static void migrate_page_add(struct page *page, struct list_head *pagelist,
229				unsigned long flags);
230
231/* Scan through pages, checking whether they satisfy certain conditions. */
232static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
233		unsigned long addr, unsigned long end,
234		const nodemask_t *nodes, unsigned long flags,
235		void *private)
236{
237	pte_t *orig_pte;
238	pte_t *pte;
239	spinlock_t *ptl;
240
241	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
242	do {
243		struct page *page;
244		int nid;
245
246		if (!pte_present(*pte))
247			continue;
248		page = vm_normal_page(vma, addr, *pte);
249		if (!page)
250			continue;
251		/*
252		 * The check for PageReserved here is important to avoid
253		 * handling zero pages and other pages that may have been
254		 * marked special by the system.
255		 *
256		 * If PageReserved were not checked here then e.g. the
257		 * location of the zero page could influence the result of
258		 * MPOL_MF_STRICT, zero pages would be counted in the
259		 * per-node stats, and there would be useless attempts
260		 * to put zero pages on the migration list.
261		 */
262		if (PageReserved(page))
263			continue;
264		nid = page_to_nid(page);
265		if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
266			continue;
267
268		if (flags & MPOL_MF_STATS)
269			gather_stats(page, private, pte_dirty(*pte));
270		else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
271			migrate_page_add(page, private, flags);
272		else
273			break;
274	} while (pte++, addr += PAGE_SIZE, addr != end);
275	pte_unmap_unlock(orig_pte, ptl);
276	return addr != end;
277}
278
279static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
280		unsigned long addr, unsigned long end,
281		const nodemask_t *nodes, unsigned long flags,
282		void *private)
283{
284	pmd_t *pmd;
285	unsigned long next;
286
287	pmd = pmd_offset(pud, addr);
288	do {
289		next = pmd_addr_end(addr, end);
290		if (pmd_none_or_clear_bad(pmd))
291			continue;
292		if (check_pte_range(vma, pmd, addr, next, nodes,
293				    flags, private))
294			return -EIO;
295	} while (pmd++, addr = next, addr != end);
296	return 0;
297}
298
299static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
300		unsigned long addr, unsigned long end,
301		const nodemask_t *nodes, unsigned long flags,
302		void *private)
303{
304	pud_t *pud;
305	unsigned long next;
306
307	pud = pud_offset(pgd, addr);
308	do {
309		next = pud_addr_end(addr, end);
310		if (pud_none_or_clear_bad(pud))
311			continue;
312		if (check_pmd_range(vma, pud, addr, next, nodes,
313				    flags, private))
314			return -EIO;
315	} while (pud++, addr = next, addr != end);
316	return 0;
317}
318
319static inline int check_pgd_range(struct vm_area_struct *vma,
320		unsigned long addr, unsigned long end,
321		const nodemask_t *nodes, unsigned long flags,
322		void *private)
323{
324	pgd_t *pgd;
325	unsigned long next;
326
327	pgd = pgd_offset(vma->vm_mm, addr);
328	do {
329		next = pgd_addr_end(addr, end);
330		if (pgd_none_or_clear_bad(pgd))
331			continue;
332		if (check_pud_range(vma, pgd, addr, next, nodes,
333				    flags, private))
334			return -EIO;
335	} while (pgd++, addr = next, addr != end);
336	return 0;
337}
338
339/*
340 * Check if all pages in a range are on a set of nodes.
341 * If pagelist != NULL then isolate pages from the LRU and
342 * put them on the pagelist.
343 */
344static struct vm_area_struct *
345check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
346		const nodemask_t *nodes, unsigned long flags, void *private)
347{
348	int err;
349	struct vm_area_struct *first, *vma, *prev;
350
351	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
352
353		err = migrate_prep();
354		if (err)
355			return ERR_PTR(err);
356	}
357
358	first = find_vma(mm, start);
359	if (!first)
360		return ERR_PTR(-EFAULT);
361	prev = NULL;
362	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
363		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
364			if (!vma->vm_next && vma->vm_end < end)
365				return ERR_PTR(-EFAULT);
366			if (prev && prev->vm_end < vma->vm_start)
367				return ERR_PTR(-EFAULT);
368		}
369		if (!is_vm_hugetlb_page(vma) &&
370		    ((flags & MPOL_MF_STRICT) ||
371		     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
372				vma_migratable(vma)))) {
373			unsigned long endvma = vma->vm_end;
374
375			if (endvma > end)
376				endvma = end;
377			if (vma->vm_start > start)
378				start = vma->vm_start;
379			err = check_pgd_range(vma, start, endvma, nodes,
380						flags, private);
381			if (err) {
382				first = ERR_PTR(err);
383				break;
384			}
385		}
386		prev = vma;
387	}
388	return first;
389}
390
391/* Apply policy to a single VMA */
392static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
393{
394	int err = 0;
395	struct mempolicy *old = vma->vm_policy;
396
397	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
398		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
399		 vma->vm_ops, vma->vm_file,
400		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
401
402	if (vma->vm_ops && vma->vm_ops->set_policy)
403		err = vma->vm_ops->set_policy(vma, new);
404	if (!err) {
405		mpol_get(new);
406		vma->vm_policy = new;
407		mpol_free(old);
408	}
409	return err;
410}
411
412/* Step 2: apply policy to a range and do splits. */
413static int mbind_range(struct vm_area_struct *vma, unsigned long start,
414		       unsigned long end, struct mempolicy *new)
415{
416	struct vm_area_struct *next;
417	int err;
418
419	err = 0;
420	for (; vma && vma->vm_start < end; vma = next) {
421		next = vma->vm_next;
422		if (vma->vm_start < start)
423			err = split_vma(vma->vm_mm, vma, start, 1);
424		if (!err && vma->vm_end > end)
425			err = split_vma(vma->vm_mm, vma, end, 0);
426		if (!err)
427			err = policy_vma(vma, new);
428		if (err)
429			break;
430	}
431	return err;
432}
433
434/*
435 * Update task->flags PF_MEMPOLICY bit: set iff non-default
436 * mempolicy.  Allows more rapid checking of this (combined perhaps
437 * with other PF_* flag bits) on memory allocation hot code paths.
438 *
439 * If called from outside this file, the task 'p' should -only- be
440 * a newly forked child not yet visible on the task list, because
441 * manipulating the task flags of a visible task is not safe.
442 *
443 * The above limitation is why this routine has the funny name
444 * mpol_fix_fork_child_flag().
445 *
446 * It is also safe to call this with a task pointer of current,
447 * which the static wrapper mpol_set_task_struct_flag() does,
448 * for use within this file.
449 */
450
451void mpol_fix_fork_child_flag(struct task_struct *p)
452{
453	if (p->mempolicy)
454		p->flags |= PF_MEMPOLICY;
455	else
456		p->flags &= ~PF_MEMPOLICY;
457}
458
459static void mpol_set_task_struct_flag(void)
460{
461	mpol_fix_fork_child_flag(current);
462}
463
464/* Set the process memory policy */
465static long do_set_mempolicy(int mode, nodemask_t *nodes)
466{
467	struct mempolicy *new;
468
469	if (mpol_check_policy(mode, nodes))
470		return -EINVAL;
471	new = mpol_new(mode, nodes);
472	if (IS_ERR(new))
473		return PTR_ERR(new);
474	mpol_free(current->mempolicy);
475	current->mempolicy = new;
476	mpol_set_task_struct_flag();
477	if (new && new->policy == MPOL_INTERLEAVE)
478		current->il_next = first_node(new->v.nodes);
479	return 0;
480}
481
482/* Fill a zone bitmap for a policy */
483static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
484{
485	nodes_clear(*nodes);
486	switch (p->policy) {
487	case MPOL_DEFAULT:
488		break;
489	case MPOL_BIND:
490		/* Fall through */
491	case MPOL_INTERLEAVE:
492		*nodes = p->v.nodes;
493		break;
494	case MPOL_PREFERRED:
495		/* or use current node instead of memory_map? */
496		if (p->v.preferred_node < 0)
497			*nodes = node_states[N_HIGH_MEMORY];
498		else
499			node_set(p->v.preferred_node, *nodes);
500		break;
501	default:
502		BUG();
503	}
504}
505
506static int lookup_node(struct mm_struct *mm, unsigned long addr)
507{
508	struct page *p;
509	int err;
510
511	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
512	if (err >= 0) {
513		err = page_to_nid(p);
514		put_page(p);
515	}
516	return err;
517}
518
519/* Retrieve NUMA policy */
520static long do_get_mempolicy(int *policy, nodemask_t *nmask,
521			     unsigned long addr, unsigned long flags)
522{
523	int err;
524	struct mm_struct *mm = current->mm;
525	struct vm_area_struct *vma = NULL;
526	struct mempolicy *pol = current->mempolicy;
527
528	cpuset_update_task_memory_state();
529	if (flags &
530		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
531		return -EINVAL;
532
533	if (flags & MPOL_F_MEMS_ALLOWED) {
534		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
535			return -EINVAL;
536		*policy = 0;	/* just so it's initialized */
537		*nmask  = cpuset_current_mems_allowed;
538		return 0;
539	}
540
541	if (flags & MPOL_F_ADDR) {
542		down_read(&mm->mmap_sem);
543		vma = find_vma_intersection(mm, addr, addr+1);
544		if (!vma) {
545			up_read(&mm->mmap_sem);
546			return -EFAULT;
547		}
548		if (vma->vm_ops && vma->vm_ops->get_policy)
549			pol = vma->vm_ops->get_policy(vma, addr);
550		else
551			pol = vma->vm_policy;
552	} else if (addr)
553		return -EINVAL;
554
555	if (!pol)
556		pol = &default_policy;
557
558	if (flags & MPOL_F_NODE) {
559		if (flags & MPOL_F_ADDR) {
560			err = lookup_node(mm, addr);
561			if (err < 0)
562				goto out;
563			*policy = err;
564		} else if (pol == current->mempolicy &&
565				pol->policy == MPOL_INTERLEAVE) {
566			*policy = current->il_next;
567		} else {
568			err = -EINVAL;
569			goto out;
570		}
571	} else
572		*policy = pol->policy;
573
574	if (vma) {
575		up_read(&current->mm->mmap_sem);
576		vma = NULL;
577	}
578
579	err = 0;
580	if (nmask)
581		get_zonemask(pol, nmask);
582
583 out:
584	if (vma)
585		up_read(&current->mm->mmap_sem);
586	return err;
587}
588
589#ifdef CONFIG_MIGRATION
590/*
591 * page migration
592 */
593static void migrate_page_add(struct page *page, struct list_head *pagelist,
594				unsigned long flags)
595{
596	/*
597	 * Avoid migrating a page that is shared with others.
598	 */
599	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
600		isolate_lru_page(page, pagelist);
601}
602
603static struct page *new_node_page(struct page *page, unsigned long node, int **x)
604{
605	return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0);
606}
607
608/*
609 * Migrate pages from one node to a target node.
610 * Returns error or the number of pages not migrated.
611 */
612static int migrate_to_node(struct mm_struct *mm, int source, int dest,
613			   int flags)
614{
615	nodemask_t nmask;
616	LIST_HEAD(pagelist);
617	int err = 0;
618
619	nodes_clear(nmask);
620	node_set(source, nmask);
621
622	check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
623			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
624
625	if (!list_empty(&pagelist))
626		err = migrate_pages(&pagelist, new_node_page, dest);
627
628	return err;
629}
630
631/*
632 * Move pages between the two nodesets so as to preserve the physical
633 * layout as much as possible.
634 *
635 * Returns the number of pages that could not be moved.
636 */
637int do_migrate_pages(struct mm_struct *mm,
638	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
639{
640	LIST_HEAD(pagelist);
641	int busy = 0;
642	int err = 0;
643	nodemask_t tmp;
644
645  	down_read(&mm->mmap_sem);
646
647	err = migrate_vmas(mm, from_nodes, to_nodes, flags);
648	if (err)
649		goto out;
650
651/*
652 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
653 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
654 * bit in 'tmp', and return that <source, dest> pair for migration.
655 * The pair of nodemasks 'to' and 'from' define the map.
656 *
657 * If no pair of bits is found that way, fallback to picking some
658 * pair of 'source' and 'dest' bits that are not the same.  If the
659 * 'source' and 'dest' bits are the same, this represents a node
660 * that will be migrating to itself, so no pages need move.
661 *
662 * If no bits are left in 'tmp', or if all remaining bits left
663 * in 'tmp' correspond to the same bit in 'to', return false
664 * (nothing left to migrate).
665 *
666 * This lets us pick a pair of nodes to migrate between, such that
667 * if possible the dest node is not already occupied by some other
668 * source node, minimizing the risk of overloading the memory on a
669 * node, which would happen if we migrated incoming memory to a node
670 * before migrating outgoing memory sourced from that same node.
671 *
672 * A single scan of tmp is sufficient.  As we go, we remember the
673 * most recent <s, d> pair that moved (s != d).  If we find a pair
674 * that not only moved, but what's better, moved to an empty slot
675 * (d is not set in tmp), then we break out then, with that pair.
676 * Otherwise when we finish scanning tmp, we at least have the
677 * most recent <s, d> pair that moved.  If we get all the way through
678 * the scan of tmp without finding any node that moved, much less
679 * moved to an empty node, then there is nothing left worth migrating.
680 */
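/*
 * Worked example: from_nodes = {0,1}, to_nodes = {1,2}, so node_remap()
 * maps 0->1 and 1->2.  The first scan of tmp = {0,1} remembers <0,1> but
 * keeps going because dest 1 is still set in tmp; it then finds <1,2>
 * with dest 2 not in tmp and breaks.  Node 1 is therefore emptied onto
 * node 2 first, cleared from tmp, and the next scan moves node 0 onto
 * the now-vacated node 1.
 */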
681
682	tmp = *from_nodes;
683	while (!nodes_empty(tmp)) {
684		int s, d;
685		int source = -1;
686		int dest = 0;
687
688		for_each_node_mask(s, tmp) {
689			d = node_remap(s, *from_nodes, *to_nodes);
690			if (s == d)
691				continue;
692
693			source = s;	/* Node moved. Memorize */
694			dest = d;
695
696			/* dest not in remaining from nodes? */
697			if (!node_isset(dest, tmp))
698				break;
699		}
700		if (source == -1)
701			break;
702
703		node_clear(source, tmp);
704		err = migrate_to_node(mm, source, dest, flags);
705		if (err > 0)
706			busy += err;
707		if (err < 0)
708			break;
709	}
710out:
711	up_read(&mm->mmap_sem);
712	if (err < 0)
713		return err;
714	return busy;
715
716}
717
718/*
719 * Allocate a new page for page migration based on vma policy.
720 * Start assuming that page is mapped by vma pointed to by @private.
721 * Search forward from there, if not.  N.B., this assumes that the
722 * list of pages handed to migrate_pages()--which is how we get here--
723 * is in virtual address order.
724 */
725static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
726{
727	struct vm_area_struct *vma = (struct vm_area_struct *)private;
728	unsigned long uninitialized_var(address);
729
730	while (vma) {
731		address = page_address_in_vma(page, vma);
732		if (address != -EFAULT)
733			break;
734		vma = vma->vm_next;
735	}
736
737	/*
738	 * if !vma, alloc_page_vma() will use task or system default policy
739	 */
740	return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
741}
742#else
743
744static void migrate_page_add(struct page *page, struct list_head *pagelist,
745				unsigned long flags)
746{
747}
748
749int do_migrate_pages(struct mm_struct *mm,
750	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
751{
752	return -ENOSYS;
753}
754
755static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
756{
757	return NULL;
758}
759#endif
760
761static long do_mbind(unsigned long start, unsigned long len,
762		     unsigned long mode, nodemask_t *nmask,
763		     unsigned long flags)
764{
765	struct vm_area_struct *vma;
766	struct mm_struct *mm = current->mm;
767	struct mempolicy *new;
768	unsigned long end;
769	int err;
770	LIST_HEAD(pagelist);
771
772	if ((flags & ~(unsigned long)(MPOL_MF_STRICT |
773				      MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
774	    || mode > MPOL_MAX)
775		return -EINVAL;
776	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
777		return -EPERM;
778
779	if (start & ~PAGE_MASK)
780		return -EINVAL;
781
782	if (mode == MPOL_DEFAULT)
783		flags &= ~MPOL_MF_STRICT;
784
785	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
786	end = start + len;
787
788	if (end < start)
789		return -EINVAL;
790	if (end == start)
791		return 0;
792
793	if (mpol_check_policy(mode, nmask))
794		return -EINVAL;
795
796	new = mpol_new(mode, nmask);
797	if (IS_ERR(new))
798		return PTR_ERR(new);
799
800	/*
801	 * If we are using the default policy then operation
802	 * on discontinuous address spaces is okay after all
803	 */
804	if (!new)
805		flags |= MPOL_MF_DISCONTIG_OK;
806
807	pr_debug("mbind %lx-%lx mode:%ld nodes:%lx\n", start, start + len,
808		 mode, nmask ? nodes_addr(*nmask)[0] : -1);
809
810	down_write(&mm->mmap_sem);
811	vma = check_range(mm, start, end, nmask,
812			  flags | MPOL_MF_INVERT, &pagelist);
813
814	err = PTR_ERR(vma);
815	if (!IS_ERR(vma)) {
816		int nr_failed = 0;
817
818		err = mbind_range(vma, start, end, new);
819
820		if (!list_empty(&pagelist))
821			nr_failed = migrate_pages(&pagelist, new_vma_page,
822						(unsigned long)vma);
823
824		if (!err && nr_failed && (flags & MPOL_MF_STRICT))
825			err = -EIO;
826	}
827
828	up_write(&mm->mmap_sem);
829	mpol_free(new);
830	return err;
831}
832
833/*
834 * User space interface with variable sized bitmaps for nodelists.
835 */
836
837/* Copy a node mask from user space. */
838static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
839		     unsigned long maxnode)
840{
841	unsigned long k;
842	unsigned long nlongs;
843	unsigned long endmask;
844
845	--maxnode;
846	nodes_clear(*nodes);
847	if (maxnode == 0 || !nmask)
848		return 0;
849	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
850		return -EINVAL;
851
852	nlongs = BITS_TO_LONGS(maxnode);
853	if ((maxnode % BITS_PER_LONG) == 0)
854		endmask = ~0UL;
855	else
856		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
857
858	/* When the user specified more nodes than supported, just check
859	   that the unsupported part is all zero. */
860	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
861		if (nlongs > PAGE_SIZE/sizeof(long))
862			return -EINVAL;
863		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
864			unsigned long t;
865			if (get_user(t, nmask + k))
866				return -EFAULT;
867			if (k == nlongs - 1) {
868				if (t & endmask)
869					return -EINVAL;
870			} else if (t)
871				return -EINVAL;
872		}
873		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
874		endmask = ~0UL;
875	}
876
877	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
878		return -EFAULT;
879	nodes_addr(*nodes)[nlongs-1] &= endmask;
880	return 0;
881}
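/*
 * Worked example (64-bit kernel, BITS_PER_LONG == 64): maxnode = 65 from
 * userspace becomes 64 after the decrement, giving nlongs = 1 and
 * endmask = ~0UL, so one unsigned long is copied and all 64 bits are
 * honoured.  With maxnode = 64 the decrement leaves 63 bits, so
 * endmask = (1UL << 63) - 1 and the top bit of the copied word is
 * silently cleared.
 */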
882
883/* Copy a kernel node mask to user space */
884static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
885			      nodemask_t *nodes)
886{
887	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
888	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
889
890	if (copy > nbytes) {
891		if (copy > PAGE_SIZE)
892			return -EINVAL;
893		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
894			return -EFAULT;
895		copy = nbytes;
896	}
897	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
898}
899
900asmlinkage long sys_mbind(unsigned long start, unsigned long len,
901			unsigned long mode,
902			unsigned long __user *nmask, unsigned long maxnode,
903			unsigned flags)
904{
905	nodemask_t nodes;
906	int err;
907
908	err = get_nodes(&nodes, nmask, maxnode);
909	if (err)
910		return err;
911	return do_mbind(start, len, mode, &nodes, flags);
912}
913
914/* Set the process memory policy */
915asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
916		unsigned long maxnode)
917{
918	int err;
919	nodemask_t nodes;
920
921	if (mode < 0 || mode > MPOL_MAX)
922		return -EINVAL;
923	err = get_nodes(&nodes, nmask, maxnode);
924	if (err)
925		return err;
926	return do_set_mempolicy(mode, &nodes);
927}
928
929asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
930		const unsigned long __user *old_nodes,
931		const unsigned long __user *new_nodes)
932{
933	struct mm_struct *mm;
934	struct task_struct *task;
935	nodemask_t old;
936	nodemask_t new;
937	nodemask_t task_nodes;
938	int err;
939
940	err = get_nodes(&old, old_nodes, maxnode);
941	if (err)
942		return err;
943
944	err = get_nodes(&new, new_nodes, maxnode);
945	if (err)
946		return err;
947
948	/* Find the mm_struct */
949	read_lock(&tasklist_lock);
950	task = pid ? find_task_by_vpid(pid) : current;
951	if (!task) {
952		read_unlock(&tasklist_lock);
953		return -ESRCH;
954	}
955	mm = get_task_mm(task);
956	read_unlock(&tasklist_lock);
957
958	if (!mm)
959		return -EINVAL;
960
961	/*
962	 * Check if this process has the right to modify the specified
963	 * process. The right exists if the process has administrative
964	 * capabilities, superuser privileges or the same
965	 * userid as the target process.
966	 */
967	if ((current->euid != task->suid) && (current->euid != task->uid) &&
968	    (current->uid != task->suid) && (current->uid != task->uid) &&
969	    !capable(CAP_SYS_NICE)) {
970		err = -EPERM;
971		goto out;
972	}
973
974	task_nodes = cpuset_mems_allowed(task);
975	/* Is the user allowed to access the target nodes? */
976	if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
977		err = -EPERM;
978		goto out;
979	}
980
981	if (!nodes_subset(new, node_states[N_HIGH_MEMORY])) {
982		err = -EINVAL;
983		goto out;
984	}
985
986	err = security_task_movememory(task);
987	if (err)
988		goto out;
989
990	err = do_migrate_pages(mm, &old, &new,
991		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
992out:
993	mmput(mm);
994	return err;
995}
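/*
 * Illustrative userspace sketch (not part of this file): moving all of a
 * task's pages from node 0 to node 1 via this syscall.  Assumes libnuma's
 * <numaif.h> wrapper for migrate_pages(2); the return value is the number
 * of pages that could not be moved, or a negative errno.
 *
 *	#include <numaif.h>
 *
 *	long move_task_to_node1(int pid)
 *	{
 *		unsigned long from = 1UL << 0;
 *		unsigned long to   = 1UL << 1;
 *
 *		return migrate_pages(pid, 8 * sizeof(from) + 1, &from, &to);
 *	}
 */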
996
997
998/* Retrieve NUMA policy */
999asmlinkage long sys_get_mempolicy(int __user *policy,
1000				unsigned long __user *nmask,
1001				unsigned long maxnode,
1002				unsigned long addr, unsigned long flags)
1003{
1004	int err;
1005	int uninitialized_var(pval);
1006	nodemask_t nodes;
1007
1008	if (nmask != NULL && maxnode < MAX_NUMNODES)
1009		return -EINVAL;
1010
1011	err = do_get_mempolicy(&pval, &nodes, addr, flags);
1012
1013	if (err)
1014		return err;
1015
1016	if (policy && put_user(pval, policy))
1017		return -EFAULT;
1018
1019	if (nmask)
1020		err = copy_nodes_to_user(nmask, maxnode, &nodes);
1021
1022	return err;
1023}
1024
1025#ifdef CONFIG_COMPAT
1026
1027asmlinkage long compat_sys_get_mempolicy(int __user *policy,
1028				     compat_ulong_t __user *nmask,
1029				     compat_ulong_t maxnode,
1030				     compat_ulong_t addr, compat_ulong_t flags)
1031{
1032	long err;
1033	unsigned long __user *nm = NULL;
1034	unsigned long nr_bits, alloc_size;
1035	DECLARE_BITMAP(bm, MAX_NUMNODES);
1036
1037	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1038	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1039
1040	if (nmask)
1041		nm = compat_alloc_user_space(alloc_size);
1042
1043	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1044
1045	if (!err && nmask) {
1046		err = copy_from_user(bm, nm, alloc_size);
1047		/* ensure entire bitmap is zeroed */
1048		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1049		err |= compat_put_bitmap(nmask, bm, nr_bits);
1050	}
1051
1052	return err;
1053}
1054
1055asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1056				     compat_ulong_t maxnode)
1057{
1058	long err = 0;
1059	unsigned long __user *nm = NULL;
1060	unsigned long nr_bits, alloc_size;
1061	DECLARE_BITMAP(bm, MAX_NUMNODES);
1062
1063	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1064	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1065
1066	if (nmask) {
1067		err = compat_get_bitmap(bm, nmask, nr_bits);
1068		nm = compat_alloc_user_space(alloc_size);
1069		err |= copy_to_user(nm, bm, alloc_size);
1070	}
1071
1072	if (err)
1073		return -EFAULT;
1074
1075	return sys_set_mempolicy(mode, nm, nr_bits+1);
1076}
1077
1078asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1079			     compat_ulong_t mode, compat_ulong_t __user *nmask,
1080			     compat_ulong_t maxnode, compat_ulong_t flags)
1081{
1082	long err = 0;
1083	unsigned long __user *nm = NULL;
1084	unsigned long nr_bits, alloc_size;
1085	nodemask_t bm;
1086
1087	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1088	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1089
1090	if (nmask) {
1091		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1092		nm = compat_alloc_user_space(alloc_size);
1093		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1094	}
1095
1096	if (err)
1097		return -EFAULT;
1098
1099	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1100}
1101
1102#endif
1103
1104/*
1105 * get_vma_policy(@task, @vma, @addr)
1106 * @task - task for fallback if vma policy == default
1107 * @vma   - virtual memory area whose policy is sought
1108 * @addr  - address in @vma for shared policy lookup
1109 *
1110 * Returns effective policy for a VMA at specified address.
1111 * Falls back to @task or system default policy, as necessary.
1112 * The returned policy has an extra reference if it is shared, a vma
1113 * policy, or some other task's policy [show_numa_map() can pass
1114 * @task != current].  It is the caller's responsibility to
1115 * free the reference in these cases.
1116 */
1117static struct mempolicy * get_vma_policy(struct task_struct *task,
1118		struct vm_area_struct *vma, unsigned long addr)
1119{
1120	struct mempolicy *pol = task->mempolicy;
1121	int shared_pol = 0;
1122
1123	if (vma) {
1124		if (vma->vm_ops && vma->vm_ops->get_policy) {
1125			pol = vma->vm_ops->get_policy(vma, addr);
1126			shared_pol = 1;	/* if pol non-NULL, add ref below */
1127		} else if (vma->vm_policy &&
1128				vma->vm_policy->policy != MPOL_DEFAULT)
1129			pol = vma->vm_policy;
1130	}
1131	if (!pol)
1132		pol = &default_policy;
1133	else if (!shared_pol && pol != current->mempolicy)
1134		mpol_get(pol);	/* vma or other task's policy */
1135	return pol;
1136}
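/*
 * Typical caller-side pattern for the conditional reference described
 * above (a sketch; alloc_page_vma() and show_numa_map() below follow it):
 *
 *	struct mempolicy *pol = get_vma_policy(current, vma, addr);
 *
 *	... use pol to pick a node/zonelist ...
 *
 *	if (pol != &default_policy && pol != current->mempolicy)
 *		__mpol_free(pol);
 */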
1137
1138/* Return a nodemask representing a mempolicy */
1139static nodemask_t *nodemask_policy(gfp_t gfp, struct mempolicy *policy)
1140{
1141	/* Lower zones don't get a nodemask applied for MPOL_BIND */
1142	if (unlikely(policy->policy == MPOL_BIND) &&
1143			gfp_zone(gfp) >= policy_zone &&
1144			cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1145		return &policy->v.nodes;
1146
1147	return NULL;
1148}
1149
1150/* Return a zonelist representing a mempolicy */
1151static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
1152{
1153	int nd;
1154
1155	switch (policy->policy) {
1156	case MPOL_PREFERRED:
1157		nd = policy->v.preferred_node;
1158		if (nd < 0)
1159			nd = numa_node_id();
1160		break;
1161	case MPOL_BIND:
1162		/*
1163		 * Normally, MPOL_BIND allocations are node-local within the
1164		 * allowed nodemask. However, if __GFP_THISNODE is set and the
1165		 * current node is not part of the mask, we use the zonelist
1166		 * for the first node in the mask instead.
1167		 */
1168		nd = numa_node_id();
1169		if (unlikely(gfp & __GFP_THISNODE) &&
1170				unlikely(!node_isset(nd, policy->v.nodes)))
1171			nd = first_node(policy->v.nodes);
1172		break;
1173	case MPOL_INTERLEAVE: /* should not happen */
1174	case MPOL_DEFAULT:
1175		nd = numa_node_id();
1176		break;
1177	default:
1178		nd = 0;
1179		BUG();
1180	}
1181	return node_zonelist(nd, gfp);
1182}
1183
1184/* Do dynamic interleaving for a process */
1185static unsigned interleave_nodes(struct mempolicy *policy)
1186{
1187	unsigned nid, next;
1188	struct task_struct *me = current;
1189
1190	nid = me->il_next;
1191	next = next_node(nid, policy->v.nodes);
1192	if (next >= MAX_NUMNODES)
1193		next = first_node(policy->v.nodes);
1194	me->il_next = next;
1195	return nid;
1196}
1197
1198/*
1199 * Depending on the memory policy provide a node from which to allocate the
1200 * next slab entry.
1201 */
1202unsigned slab_node(struct mempolicy *policy)
1203{
1204	int pol = policy ? policy->policy : MPOL_DEFAULT;
1205
1206	switch (pol) {
1207	case MPOL_INTERLEAVE:
1208		return interleave_nodes(policy);
1209
1210	case MPOL_BIND: {
1211		/*
1212		 * Follow bind policy behavior and start allocation at the
1213		 * first node.
1214		 */
1215		struct zonelist *zonelist;
1216		struct zone *zone;
1217		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1218		zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
1219		(void)first_zones_zonelist(zonelist, highest_zoneidx,
1220							&policy->v.nodes,
1221							&zone);
1222		return zone->node;
1223	}
1224
1225	case MPOL_PREFERRED:
1226		if (policy->v.preferred_node >= 0)
1227			return policy->v.preferred_node;
1228		/* Fall through */
1229
1230	default:
1231		return numa_node_id();
1232	}
1233}
1234
1235/* Do static interleaving for a VMA with known offset. */
1236static unsigned offset_il_node(struct mempolicy *pol,
1237		struct vm_area_struct *vma, unsigned long off)
1238{
1239	unsigned nnodes = nodes_weight(pol->v.nodes);
1240	unsigned target = (unsigned)off % nnodes;
1241	int c;
1242	int nid = -1;
1243
1244	c = 0;
1245	do {
1246		nid = next_node(nid, pol->v.nodes);
1247		c++;
1248	} while (c <= target);
1249	return nid;
1250}
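/*
 * Worked example: with pol->v.nodes = {0,2,5} and off = 7, nnodes = 3
 * and target = 7 % 3 = 1, so the loop stops at the second node of the
 * mask and returns node 2.  Consecutive offsets thus map round-robin
 * onto the interleave set.
 */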
1251
1252/* Determine a node number for interleave */
1253static inline unsigned interleave_nid(struct mempolicy *pol,
1254		 struct vm_area_struct *vma, unsigned long addr, int shift)
1255{
1256	if (vma) {
1257		unsigned long off;
1258
1259		/*
1260		 * for small pages, there is no difference between
1261		 * shift and PAGE_SHIFT, so the bit-shift is safe.
1262		 * for huge pages, since vm_pgoff is in units of small
1263		 * pages, we need to shift off the always 0 bits to get
1264		 * a useful offset.
1265		 */
1266		BUG_ON(shift < PAGE_SHIFT);
1267		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1268		off += (addr - vma->vm_start) >> shift;
1269		return offset_il_node(pol, vma, off);
1270	} else
1271		return interleave_nodes(pol);
1272}
1273
1274#ifdef CONFIG_HUGETLBFS
1275/*
1276 * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1277 * @vma = virtual memory area whose policy is sought
1278 * @addr = address in @vma for shared policy lookup and interleave policy
1279 * @gfp_flags = for requested zone
1280 * @mpol = pointer to mempolicy pointer for reference counted mempolicy
1281 * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
1282 *
1283 * Returns a zonelist suitable for a huge page allocation.
1284 * If the effective policy is 'BIND, returns pointer to local node's zonelist,
1285 * and a pointer to the mempolicy's @nodemask for filtering the zonelist.
1286 * If it is also a policy for which get_vma_policy() returns an extra
1287 * reference, we must hold that reference until after the allocation.
1288 * In that case, return policy via @mpol so hugetlb allocation can drop
1289 * the reference. For non-'BIND referenced policies, we can/do drop the
1290 * reference here, so the caller doesn't need to know about the special case
1291 * for default and current task policy.
1292 */
1293struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1294				gfp_t gfp_flags, struct mempolicy **mpol,
1295				nodemask_t **nodemask)
1296{
1297	struct mempolicy *pol = get_vma_policy(current, vma, addr);
1298	struct zonelist *zl;
1299
1300	*mpol = NULL;		/* probably no unref needed */
1301	*nodemask = NULL;	/* assume !MPOL_BIND */
1302	if (pol->policy == MPOL_BIND) {
1303			*nodemask = &pol->v.nodes;
1304	} else if (pol->policy == MPOL_INTERLEAVE) {
1305		unsigned nid;
1306
1307		nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
1308		if (unlikely(pol != &default_policy &&
1309				pol != current->mempolicy))
1310			__mpol_free(pol);	/* finished with pol */
1311		return node_zonelist(nid, gfp_flags);
1312	}
1313
1314	zl = zonelist_policy(GFP_HIGHUSER, pol);
1315	if (unlikely(pol != &default_policy && pol != current->mempolicy)) {
1316		if (pol->policy != MPOL_BIND)
1317			__mpol_free(pol);	/* finished with pol */
1318		else
1319			*mpol = pol;	/* unref needed after allocation */
1320	}
1321	return zl;
1322}
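/*
 * Expected hugetlb caller side, sketched (names approximate, not a
 * verbatim copy of mm/hugetlb.c):
 *
 *	struct mempolicy *mpol;
 *	nodemask_t *nodemask;
 *	struct zonelist *zl = huge_zonelist(vma, addr, gfp_mask,
 *					    &mpol, &nodemask);
 *
 *	... walk zl, skipping zones whose node is not in *nodemask
 *	    whenever nodemask is non-NULL ...
 *
 *	mpol_free(mpol);	(mpol_free() ignores the NULL left behind
 *				 for non-'BIND policies)
 */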
1323#endif
1324
1325/* Allocate a page in interleaved policy.
1326   Own path because it needs to do special accounting. */
1327static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1328					unsigned nid)
1329{
1330	struct zonelist *zl;
1331	struct page *page;
1332
1333	zl = node_zonelist(nid, gfp);
1334	page = __alloc_pages(gfp, order, zl);
1335	if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1336		inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1337	return page;
1338}
1339
1340/**
1341 * 	alloc_page_vma	- Allocate a page for a VMA.
1342 *
1343 * 	@gfp:
1344 *      %GFP_USER    user allocation.
1345 *      %GFP_KERNEL  kernel allocations,
1346 *      %GFP_HIGHMEM highmem/user allocations,
1347 *      %GFP_FS      allocation should not call back into a file system.
1348 *      %GFP_ATOMIC  don't sleep.
1349 *
1350 * 	@vma:  Pointer to VMA or NULL if not available.
1351 *	@addr: Virtual Address of the allocation. Must be inside the VMA.
1352 *
1353 * 	This function allocates a page from the kernel page pool and applies
1354 *	a NUMA policy associated with the VMA or the current process.
1355 *	When VMA is not NULL caller must hold down_read on the mmap_sem of the
1356 *	mm_struct of the VMA to prevent it from going away. Should be used for
1357 *	all allocations for pages that will be mapped into
1358 * 	user space. Returns NULL when no page can be allocated.
1359 *
1360 *	Should be called with the mmap_sem of the vma held.
1361 */
1362struct page *
1363alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1364{
1365	struct mempolicy *pol = get_vma_policy(current, vma, addr);
1366	struct zonelist *zl;
1367
1368	cpuset_update_task_memory_state();
1369
1370	if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
1371		unsigned nid;
1372
1373		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1374		if (unlikely(pol != &default_policy &&
1375				pol != current->mempolicy))
1376			__mpol_free(pol);	/* finished with pol */
1377		return alloc_page_interleave(gfp, 0, nid);
1378	}
1379	zl = zonelist_policy(gfp, pol);
1380	if (pol != &default_policy && pol != current->mempolicy) {
1381		/*
1382		 * slow path: ref counted policy -- shared or vma
1383		 */
1384		struct page *page =  __alloc_pages_nodemask(gfp, 0,
1385						zl, nodemask_policy(gfp, pol));
1386		__mpol_free(pol);
1387		return page;
1388	}
1389	/*
1390	 * fast path:  default or task policy
1391	 */
1392	return __alloc_pages_nodemask(gfp, 0, zl, nodemask_policy(gfp, pol));
1393}
1394
1395/**
1396 * 	alloc_pages_current - Allocate pages.
1397 *
1398 *	@gfp:
1399 *		%GFP_USER   user allocation,
1400 *      	%GFP_KERNEL kernel allocation,
1401 *      	%GFP_HIGHMEM highmem allocation,
1402 *      	%GFP_FS     don't call back into a file system.
1403 *      	%GFP_ATOMIC don't sleep.
1404 *	@order: Power of two of allocation size in pages. 0 is a single page.
1405 *
1406 *	Allocate a page from the kernel page pool.  When not in
1407 *	interrupt context, apply the current process' NUMA policy.
1408 *	Returns NULL when no page can be allocated.
1409 *
1410 *	Don't call cpuset_update_task_memory_state() unless
1411 *	1) it's ok to take cpuset_sem (can WAIT), and
1412 *	2) allocating for current task (not interrupt).
1413 */
1414struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1415{
1416	struct mempolicy *pol = current->mempolicy;
1417
1418	if ((gfp & __GFP_WAIT) && !in_interrupt())
1419		cpuset_update_task_memory_state();
1420	if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1421		pol = &default_policy;
1422	if (pol->policy == MPOL_INTERLEAVE)
1423		return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1424	return __alloc_pages_nodemask(gfp, order,
1425			zonelist_policy(gfp, pol), nodemask_policy(gfp, pol));
1426}
1427EXPORT_SYMBOL(alloc_pages_current);
1428
1429/*
1430 * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it
1431 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
1432 * with the mems_allowed returned by cpuset_mems_allowed().  This
1433 * keeps mempolicies cpuset relative after its cpuset moves.  See
1434 * further kernel/cpuset.c update_nodemask().
1435 */
1436
1437/* Slow path of a mempolicy copy */
1438struct mempolicy *__mpol_copy(struct mempolicy *old)
1439{
1440	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1441
1442	if (!new)
1443		return ERR_PTR(-ENOMEM);
1444	if (current_cpuset_is_being_rebound()) {
1445		nodemask_t mems = cpuset_mems_allowed(current);
1446		mpol_rebind_policy(old, &mems);
1447	}
1448	*new = *old;
1449	atomic_set(&new->refcnt, 1);
1450	return new;
1451}
1452
1453/* Slow path of a mempolicy comparison */
1454int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1455{
1456	if (!a || !b)
1457		return 0;
1458	if (a->policy != b->policy)
1459		return 0;
1460	switch (a->policy) {
1461	case MPOL_DEFAULT:
1462		return 1;
1463	case MPOL_BIND:
1464		/* Fall through */
1465	case MPOL_INTERLEAVE:
1466		return nodes_equal(a->v.nodes, b->v.nodes);
1467	case MPOL_PREFERRED:
1468		return a->v.preferred_node == b->v.preferred_node;
1469	default:
1470		BUG();
1471		return 0;
1472	}
1473}
1474
1475/* Slow path of a mpol destructor. */
1476void __mpol_free(struct mempolicy *p)
1477{
1478	if (!atomic_dec_and_test(&p->refcnt))
1479		return;
1480	p->policy = MPOL_DEFAULT;
1481	kmem_cache_free(policy_cache, p);
1482}
1483
1484/*
1485 * Shared memory backing store policy support.
1486 *
1487 * Remember policies even when nobody has shared memory mapped.
1488 * The policies are kept in Red-Black tree linked from the inode.
1489 * They are protected by the sp->lock spinlock, which should be held
1490 * for any accesses to the tree.
1491 */
1492
1493/* lookup first element intersecting start-end */
1494/* Caller holds sp->lock */
1495static struct sp_node *
1496sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1497{
1498	struct rb_node *n = sp->root.rb_node;
1499
1500	while (n) {
1501		struct sp_node *p = rb_entry(n, struct sp_node, nd);
1502
1503		if (start >= p->end)
1504			n = n->rb_right;
1505		else if (end <= p->start)
1506			n = n->rb_left;
1507		else
1508			break;
1509	}
1510	if (!n)
1511		return NULL;
1512	for (;;) {
1513		struct sp_node *w = NULL;
1514		struct rb_node *prev = rb_prev(n);
1515		if (!prev)
1516			break;
1517		w = rb_entry(prev, struct sp_node, nd);
1518		if (w->end <= start)
1519			break;
1520		n = prev;
1521	}
1522	return rb_entry(n, struct sp_node, nd);
1523}
1524
1525/* Insert a new shared policy into the list. */
1526/* Caller holds sp->lock */
1527static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1528{
1529	struct rb_node **p = &sp->root.rb_node;
1530	struct rb_node *parent = NULL;
1531	struct sp_node *nd;
1532
1533	while (*p) {
1534		parent = *p;
1535		nd = rb_entry(parent, struct sp_node, nd);
1536		if (new->start < nd->start)
1537			p = &(*p)->rb_left;
1538		else if (new->end > nd->end)
1539			p = &(*p)->rb_right;
1540		else
1541			BUG();
1542	}
1543	rb_link_node(&new->nd, parent, p);
1544	rb_insert_color(&new->nd, &sp->root);
1545	pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
1546		 new->policy ? new->policy->policy : 0);
1547}
1548
1549/* Find shared policy intersecting idx */
1550struct mempolicy *
1551mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1552{
1553	struct mempolicy *pol = NULL;
1554	struct sp_node *sn;
1555
1556	if (!sp->root.rb_node)
1557		return NULL;
1558	spin_lock(&sp->lock);
1559	sn = sp_lookup(sp, idx, idx+1);
1560	if (sn) {
1561		mpol_get(sn->policy);
1562		pol = sn->policy;
1563	}
1564	spin_unlock(&sp->lock);
1565	return pol;
1566}
1567
1568static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1569{
1570	pr_debug("deleting %lx-%lx\n", n->start, n->end);
1571	rb_erase(&n->nd, &sp->root);
1572	mpol_free(n->policy);
1573	kmem_cache_free(sn_cache, n);
1574}
1575
1576static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
1577				struct mempolicy *pol)
1578{
1579	struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1580
1581	if (!n)
1582		return NULL;
1583	n->start = start;
1584	n->end = end;
1585	mpol_get(pol);
1586	n->policy = pol;
1587	return n;
1588}
1589
1590/* Replace a policy range. */
1591static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1592				 unsigned long end, struct sp_node *new)
1593{
1594	struct sp_node *n, *new2 = NULL;
1595
1596restart:
1597	spin_lock(&sp->lock);
1598	n = sp_lookup(sp, start, end);
1599	/* Take care of old policies in the same range. */
1600	while (n && n->start < end) {
1601		struct rb_node *next = rb_next(&n->nd);
1602		if (n->start >= start) {
1603			if (n->end <= end)
1604				sp_delete(sp, n);
1605			else
1606				n->start = end;
1607		} else {
1608			/* Old policy spanning whole new range. */
1609			if (n->end > end) {
1610				if (!new2) {
1611					spin_unlock(&sp->lock);
1612					new2 = sp_alloc(end, n->end, n->policy);
1613					if (!new2)
1614						return -ENOMEM;
1615					goto restart;
1616				}
1617				n->end = start;
1618				sp_insert(sp, new2);
1619				new2 = NULL;
1620				break;
1621			} else
1622				n->end = start;
1623		}
1624		if (!next)
1625			break;
1626		n = rb_entry(next, struct sp_node, nd);
1627	}
1628	if (new)
1629		sp_insert(sp, new);
1630	spin_unlock(&sp->lock);
1631	if (new2) {
1632		mpol_free(new2->policy);
1633		kmem_cache_free(sn_cache, new2);
1634	}
1635	return 0;
1636}
1637
1638void mpol_shared_policy_init(struct shared_policy *info, int policy,
1639				nodemask_t *policy_nodes)
1640{
1641	info->root = RB_ROOT;
1642	spin_lock_init(&info->lock);
1643
1644	if (policy != MPOL_DEFAULT) {
1645		struct mempolicy *newpol;
1646
1647		/* Falls back to MPOL_DEFAULT on any error */
1648		newpol = mpol_new(policy, policy_nodes);
1649		if (!IS_ERR(newpol)) {
1650			/* Create pseudo-vma that contains just the policy */
1651			struct vm_area_struct pvma;
1652
1653			memset(&pvma, 0, sizeof(struct vm_area_struct));
1654			/* Policy covers entire file */
1655			pvma.vm_end = TASK_SIZE;
1656			mpol_set_shared_policy(info, &pvma, newpol);
1657			mpol_free(newpol);
1658		}
1659	}
1660}
1661
1662int mpol_set_shared_policy(struct shared_policy *info,
1663			struct vm_area_struct *vma, struct mempolicy *npol)
1664{
1665	int err;
1666	struct sp_node *new = NULL;
1667	unsigned long sz = vma_pages(vma);
1668
1669	pr_debug("set_shared_policy %lx sz %lu %d %lx\n",
1670		 vma->vm_pgoff,
1671		 sz, npol? npol->policy : -1,
1672		 npol ? nodes_addr(npol->v.nodes)[0] : -1);
1673
1674	if (npol) {
1675		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1676		if (!new)
1677			return -ENOMEM;
1678	}
1679	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1680	if (err && new)
1681		kmem_cache_free(sn_cache, new);
1682	return err;
1683}
1684
1685/* Free a backing policy store on inode delete. */
1686void mpol_free_shared_policy(struct shared_policy *p)
1687{
1688	struct sp_node *n;
1689	struct rb_node *next;
1690
1691	if (!p->root.rb_node)
1692		return;
1693	spin_lock(&p->lock);
1694	next = rb_first(&p->root);
1695	while (next) {
1696		n = rb_entry(next, struct sp_node, nd);
1697		next = rb_next(&n->nd);
1698		rb_erase(&n->nd, &p->root);
1699		mpol_free(n->policy);
1700		kmem_cache_free(sn_cache, n);
1701	}
1702	spin_unlock(&p->lock);
1703}
1704
1705/* assumes fs == KERNEL_DS */
1706void __init numa_policy_init(void)
1707{
1708	nodemask_t interleave_nodes;
1709	unsigned long largest = 0;
1710	int nid, prefer = 0;
1711
1712	policy_cache = kmem_cache_create("numa_policy",
1713					 sizeof(struct mempolicy),
1714					 0, SLAB_PANIC, NULL);
1715
1716	sn_cache = kmem_cache_create("shared_policy_node",
1717				     sizeof(struct sp_node),
1718				     0, SLAB_PANIC, NULL);
1719
1720	/*
1721	 * Set interleaving policy for system init. Interleaving is only
1722	 * enabled across suitably sized nodes (default is >= 16MB), or
1723	 * fall back to the largest node if they're all smaller.
1724	 */
1725	nodes_clear(interleave_nodes);
1726	for_each_node_state(nid, N_HIGH_MEMORY) {
1727		unsigned long total_pages = node_present_pages(nid);
1728
1729		/* Preserve the largest node */
1730		if (largest < total_pages) {
1731			largest = total_pages;
1732			prefer = nid;
1733		}
1734
1735		/* Interleave this node? */
1736		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
1737			node_set(nid, interleave_nodes);
1738	}
1739
1740	/* All too small, use the largest */
1741	if (unlikely(nodes_empty(interleave_nodes)))
1742		node_set(prefer, interleave_nodes);
1743
1744	if (do_set_mempolicy(MPOL_INTERLEAVE, &interleave_nodes))
1745		printk("numa_policy_init: interleaving failed\n");
1746}
1747
1748/* Reset policy of current process to default */
1749void numa_default_policy(void)
1750{
1751	do_set_mempolicy(MPOL_DEFAULT, NULL);
1752}
1753
1754/* Migrate a policy to a different set of nodes */
1755static void mpol_rebind_policy(struct mempolicy *pol,
1756			       const nodemask_t *newmask)
1757{
1758	nodemask_t *mpolmask;
1759	nodemask_t tmp;
1760
1761	if (!pol)
1762		return;
1763	mpolmask = &pol->cpuset_mems_allowed;
1764	if (nodes_equal(*mpolmask, *newmask))
1765		return;
1766
1767	switch (pol->policy) {
1768	case MPOL_DEFAULT:
1769		break;
1770	case MPOL_BIND:
1771		/* Fall through */
1772	case MPOL_INTERLEAVE:
1773		nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
1774		pol->v.nodes = tmp;
1775		*mpolmask = *newmask;
1776		current->il_next = node_remap(current->il_next,
1777						*mpolmask, *newmask);
1778		break;
1779	case MPOL_PREFERRED:
1780		pol->v.preferred_node = node_remap(pol->v.preferred_node,
1781						*mpolmask, *newmask);
1782		*mpolmask = *newmask;
1783		break;
1784	default:
1785		BUG();
1786		break;
1787	}
1788}
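/*
 * Worked example: an MPOL_INTERLEAVE policy over nodes {2,3} created
 * while cpuset_mems_allowed was {2,3} is remapped to {6,7} when the
 * cpuset moves to mems {6,7}; an MPOL_PREFERRED policy on node 3
 * likewise becomes a preference for node 7.
 */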
1789
1790/*
1791 * Wrapper for mpol_rebind_policy() that just requires task
1792 * pointer, and updates task mempolicy.
1793 */
1794
1795void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
1796{
1797	mpol_rebind_policy(tsk->mempolicy, new);
1798}
1799
1800/*
1801 * Rebind each vma in mm to new nodemask.
1802 *
1803 * Call holding a reference to mm.  Takes mm->mmap_sem during call.
1804 */
1805
1806void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
1807{
1808	struct vm_area_struct *vma;
1809
1810	down_write(&mm->mmap_sem);
1811	for (vma = mm->mmap; vma; vma = vma->vm_next)
1812		mpol_rebind_policy(vma->vm_policy, new);
1813	up_write(&mm->mmap_sem);
1814}
1815
1816/*
1817 * Display pages allocated per node and memory policy via /proc.
1818 */
1819
1820static const char * const policy_types[] =
1821	{ "default", "prefer", "bind", "interleave" };
1822
1823/*
1824 * Convert a mempolicy into a string.
1825 * Returns the number of characters in buffer (if positive)
1826 * or an error (negative)
1827 */
1828static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
1829{
1830	char *p = buffer;
1831	int l;
1832	nodemask_t nodes;
1833	int mode = pol ? pol->policy : MPOL_DEFAULT;
1834
1835	switch (mode) {
1836	case MPOL_DEFAULT:
1837		nodes_clear(nodes);
1838		break;
1839
1840	case MPOL_PREFERRED:
1841		nodes_clear(nodes);
1842		node_set(pol->v.preferred_node, nodes);
1843		break;
1844
1845	case MPOL_BIND:
1846		/* Fall through */
1847	case MPOL_INTERLEAVE:
1848		nodes = pol->v.nodes;
1849		break;
1850
1851	default:
1852		BUG();
1853		return -EFAULT;
1854	}
1855
1856	l = strlen(policy_types[mode]);
1857 	if (buffer + maxlen < p + l + 1)
1858 		return -ENOSPC;
1859
1860	strcpy(p, policy_types[mode]);
1861	p += l;
1862
1863	if (!nodes_empty(nodes)) {
1864		if (buffer + maxlen < p + 2)
1865			return -ENOSPC;
1866		*p++ = '=';
1867	 	p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
1868	}
1869	return p - buffer;
1870}
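/*
 * Example outputs: "default" for a NULL or MPOL_DEFAULT policy,
 * "prefer=1" for MPOL_PREFERRED on node 1, and "interleave=0-3" for
 * MPOL_INTERLEAVE over nodes 0-3 (the nodelist part comes from
 * nodelist_scnprintf()).
 */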
1871
1872struct numa_maps {
1873	unsigned long pages;
1874	unsigned long anon;
1875	unsigned long active;
1876	unsigned long writeback;
1877	unsigned long mapcount_max;
1878	unsigned long dirty;
1879	unsigned long swapcache;
1880	unsigned long node[MAX_NUMNODES];
1881};
1882
1883static void gather_stats(struct page *page, void *private, int pte_dirty)
1884{
1885	struct numa_maps *md = private;
1886	int count = page_mapcount(page);
1887
1888	md->pages++;
1889	if (pte_dirty || PageDirty(page))
1890		md->dirty++;
1891
1892	if (PageSwapCache(page))
1893		md->swapcache++;
1894
1895	if (PageActive(page))
1896		md->active++;
1897
1898	if (PageWriteback(page))
1899		md->writeback++;
1900
1901	if (PageAnon(page))
1902		md->anon++;
1903
1904	if (count > md->mapcount_max)
1905		md->mapcount_max = count;
1906
1907	md->node[page_to_nid(page)]++;
1908}
1909
1910#ifdef CONFIG_HUGETLB_PAGE
1911static void check_huge_range(struct vm_area_struct *vma,
1912		unsigned long start, unsigned long end,
1913		struct numa_maps *md)
1914{
1915	unsigned long addr;
1916	struct page *page;
1917
1918	for (addr = start; addr < end; addr += HPAGE_SIZE) {
1919		pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK);
1920		pte_t pte;
1921
1922		if (!ptep)
1923			continue;
1924
1925		pte = *ptep;
1926		if (pte_none(pte))
1927			continue;
1928
1929		page = pte_page(pte);
1930		if (!page)
1931			continue;
1932
1933		gather_stats(page, md, pte_dirty(*ptep));
1934	}
1935}
1936#else
1937static inline void check_huge_range(struct vm_area_struct *vma,
1938		unsigned long start, unsigned long end,
1939		struct numa_maps *md)
1940{
1941}
1942#endif
1943
1944int show_numa_map(struct seq_file *m, void *v)
1945{
1946	struct proc_maps_private *priv = m->private;
1947	struct vm_area_struct *vma = v;
1948	struct numa_maps *md;
1949	struct file *file = vma->vm_file;
1950	struct mm_struct *mm = vma->vm_mm;
1951	struct mempolicy *pol;
1952	int n;
1953	char buffer[50];
1954
1955	if (!mm)
1956		return 0;
1957
1958	md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
1959	if (!md)
1960		return 0;
1961
1962	pol = get_vma_policy(priv->task, vma, vma->vm_start);
1963	mpol_to_str(buffer, sizeof(buffer), pol);
1964	/*
1965	 * unref shared or other task's mempolicy
1966	 */
1967	if (pol != &default_policy && pol != current->mempolicy)
1968		__mpol_free(pol);
1969
1970	seq_printf(m, "%08lx %s", vma->vm_start, buffer);
1971
1972	if (file) {
1973		seq_printf(m, " file=");
1974		seq_path(m, &file->f_path, "\n\t= ");
1975	} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
1976		seq_printf(m, " heap");
1977	} else if (vma->vm_start <= mm->start_stack &&
1978			vma->vm_end >= mm->start_stack) {
1979		seq_printf(m, " stack");
1980	}
1981
1982	if (is_vm_hugetlb_page(vma)) {
1983		check_huge_range(vma, vma->vm_start, vma->vm_end, md);
1984		seq_printf(m, " huge");
1985	} else {
1986		check_pgd_range(vma, vma->vm_start, vma->vm_end,
1987			&node_states[N_HIGH_MEMORY], MPOL_MF_STATS, md);
1988	}
1989
1990	if (!md->pages)
1991		goto out;
1992
1993	if (md->anon)
1994		seq_printf(m, " anon=%lu", md->anon);
1995
1996	if (md->dirty)
1997		seq_printf(m, " dirty=%lu", md->dirty);
1998
1999	if (md->pages != md->anon && md->pages != md->dirty)
2000		seq_printf(m, " mapped=%lu", md->pages);
2001
2002	if (md->mapcount_max > 1)
2003		seq_printf(m, " mapmax=%lu", md->mapcount_max);
2004
2005	if (md->swapcache)
2006		seq_printf(m, " swapcache=%lu", md->swapcache);
2007
2008	if (md->active < md->pages && !is_vm_hugetlb_page(vma))
2009		seq_printf(m, " active=%lu", md->active);
2010
2011	if (md->writeback)
2012		seq_printf(m, " writeback=%lu", md->writeback);
2013
2014	for_each_node_state(n, N_HIGH_MEMORY)
2015		if (md->node[n])
2016			seq_printf(m, " N%d=%lu", n, md->node[n]);
2017out:
2018	seq_putc(m, '\n');
2019	kfree(md);
2020
2021	if (m->count < m->size)
2022		m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
2023	return 0;
2024}
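/*
 * Example of a resulting /proc/<pid>/numa_maps line (illustrative values):
 *
 *	7f2a40000000 interleave=0-3 anon=512 dirty=512 active=480 N0=128 N1=128 N2=128 N3=128
 *
 * i.e. the VMA start address, the policy string from mpol_to_str(), and
 * then the non-zero counters gathered above.
 */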
2025