mempolicy.c revision 52cd3b074050dd664380b5e8cfc85d4a6ed8ad48
1/*
2 * Simple NUMA memory policy for the Linux kernel.
3 *
4 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
6 * Subject to the GNU Public License, version 2.
7 *
8 * NUMA policy allows the user to give hints in which node(s) memory should
9 * be allocated.
10 *
11 * Support four policies per VMA and per process:
12 *
13 * The VMA policy has priority over the process policy for a page fault.
14 *
15 * interleave     Allocate memory interleaved over a set of nodes,
16 *                with normal fallback if it fails.
17 *                For VMA based allocations this interleaves based on the
18 *                offset into the backing object or offset into the mapping
19 *                for anonymous memory. For process policy a per-process counter
20 *                is used.
21 *
22 * bind           Only allocate memory on a specific set of nodes,
23 *                no fallback.
24 *                FIXME: memory is allocated starting with the first node
25 *                to the last. It would be better if bind would truly restrict
26 *                the allocation to the allowed memory nodes instead.
27 *
28 * preferred       Try a specific node first before normal fallback.
29 *                As a special case node -1 here means do the allocation
30 *                on the local CPU. This is normally identical to default,
31 *                but useful to set in a VMA when you have a non default
32 *                process policy.
33 *
34 * default        Allocate on the local node first, or when on a VMA
35 *                use the process policy. This is what Linux always did
36 *		  in a NUMA aware kernel and still does by, ahem, default.
37 *
38 * The process policy is applied for most non-interrupt memory allocations
39 * in that process' context. Interrupts ignore the policies and always
40 * try to allocate on the local CPU. The VMA policy is only applied for memory
41 * allocations for a VMA in the VM.
42 *
43 * Currently there are a few corner cases in swapping where the policy
44 * is not applied, but the majority should be handled. When process policy
45 * is used it is not remembered over swap outs/swap ins.
46 *
47 * Only the highest zone in the zone hierarchy gets policied. Allocations
48 * requesting a lower zone just use default policy. This implies that
49 * on systems with highmem kernel lowmem allocations don't get policied.
50 * Same with GFP_DMA allocations.
51 *
52 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
53 * all users and remembered even when nobody has memory mapped.
54 */
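
/*
 * Illustrative userspace usage (a sketch, not code in this file): the
 * policies above are normally installed with the mbind(2) and
 * set_mempolicy(2) system calls, e.g. through the libnuma <numaif.h>
 * wrappers, where maxnode is conventionally the mask size in bits + 1:
 *
 *	unsigned long mask = 0x3;			bits 0,1 = nodes 0,1
 *
 *	set_mempolicy(MPOL_INTERLEAVE, &mask, sizeof(mask) * 8 + 1);
 *	mbind(addr, len, MPOL_BIND, &mask, sizeof(mask) * 8 + 1,
 *	      MPOL_MF_STRICT);
 */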
55
56/* Notebook:
57   fix mmap readahead to honour policy and enable policy for any page cache
58   object
59   statistics for bigpages
60   global policy for page cache? currently it uses process policy. Requires
61   first item above.
62   handle mremap for shared memory (currently ignored for the policy)
63   grows down?
64   make bind policy root only? It can trigger oom much faster and the
65   kernel is not always graceful about that.
66*/
67
68#include <linux/mempolicy.h>
69#include <linux/mm.h>
70#include <linux/highmem.h>
71#include <linux/hugetlb.h>
72#include <linux/kernel.h>
73#include <linux/sched.h>
74#include <linux/nodemask.h>
75#include <linux/cpuset.h>
76#include <linux/gfp.h>
77#include <linux/slab.h>
78#include <linux/string.h>
79#include <linux/module.h>
80#include <linux/nsproxy.h>
81#include <linux/interrupt.h>
82#include <linux/init.h>
83#include <linux/compat.h>
84#include <linux/swap.h>
85#include <linux/seq_file.h>
86#include <linux/proc_fs.h>
87#include <linux/migrate.h>
88#include <linux/rmap.h>
89#include <linux/security.h>
90#include <linux/syscalls.h>
91
92#include <asm/tlbflush.h>
93#include <asm/uaccess.h>
94
95/* Internal flags */
96#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for contiguous vmas */
97#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
98#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)		/* Gather statistics */
99
100static struct kmem_cache *policy_cache;
101static struct kmem_cache *sn_cache;
102
103/* Highest zone. A specific allocation for a zone below that is not
104   policied. */
105enum zone_type policy_zone = 0;
106
107struct mempolicy default_policy = {
108	.refcnt = ATOMIC_INIT(1), /* never free it */
109	.mode   = MPOL_DEFAULT,
110};
111
112static const struct mempolicy_operations {
113	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
114	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
115} mpol_ops[MPOL_MAX];
116
117/* Check that the nodemask contains at least one populated zone */
118static int is_valid_nodemask(const nodemask_t *nodemask)
119{
120	int nd, k;
121
122	/* Check that there is something useful in this mask */
123	k = policy_zone;
124
125	for_each_node_mask(nd, *nodemask) {
126		struct zone *z;
127
128		for (k = 0; k <= policy_zone; k++) {
129			z = &NODE_DATA(nd)->node_zones[k];
130			if (z->present_pages > 0)
131				return 1;
132		}
133	}
134
135	return 0;
136}
137
138static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
139{
140	return pol->flags & (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES);
141}
142
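/*
 * Map a cpuset-relative nodemask (MPOL_F_RELATIVE_NODES) onto the
 * currently allowed nodes: bit i of @orig selects the
 * (i mod nodes_weight(*rel))-th set bit of @rel.  Worked example
 * (a sketch): with a relative mask of {0,2} and mems_allowed {4,5,6},
 * nodes_fold() keeps {0,2} (folded mod 3) and nodes_onto() maps that
 * onto nodes {4,6}.
 */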
143static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
144				   const nodemask_t *rel)
145{
146	nodemask_t tmp;
147	nodes_fold(tmp, *orig, nodes_weight(*rel));
148	nodes_onto(*ret, tmp, *rel);
149}
150
151static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
152{
153	if (nodes_empty(*nodes))
154		return -EINVAL;
155	pol->v.nodes = *nodes;
156	return 0;
157}
158
159static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
160{
161	if (!nodes)
162		pol->v.preferred_node = -1;	/* local allocation */
163	else if (nodes_empty(*nodes))
164		return -EINVAL;			/*  no allowed nodes */
165	else
166		pol->v.preferred_node = first_node(*nodes);
167	return 0;
168}
169
170static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
171{
172	if (!is_valid_nodemask(nodes))
173		return -EINVAL;
174	pol->v.nodes = *nodes;
175	return 0;
176}
177
178/* Create a new policy */
179static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
180				  nodemask_t *nodes)
181{
182	struct mempolicy *policy;
183	nodemask_t cpuset_context_nmask;
184	int ret;
185
186	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
187		 mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
188
189	if (mode == MPOL_DEFAULT) {
190		if (nodes && !nodes_empty(*nodes))
191			return ERR_PTR(-EINVAL);
192		return NULL;
193	}
194	VM_BUG_ON(!nodes);
195
196	/*
197	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
198	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
199	 * All other modes require a valid pointer to a non-empty nodemask.
200	 */
201	if (mode == MPOL_PREFERRED) {
202		if (nodes_empty(*nodes)) {
203			if (((flags & MPOL_F_STATIC_NODES) ||
204			     (flags & MPOL_F_RELATIVE_NODES)))
205				return ERR_PTR(-EINVAL);
206			nodes = NULL;	/* flag local alloc */
207		}
208	} else if (nodes_empty(*nodes))
209		return ERR_PTR(-EINVAL);
210	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
211	if (!policy)
212		return ERR_PTR(-ENOMEM);
213	atomic_set(&policy->refcnt, 1);
214	policy->mode = mode;
215	policy->flags = flags;
216
217	if (nodes) {
218		/*
219		 * cpuset related setup doesn't apply to local allocation
220		 */
221		cpuset_update_task_memory_state();
222		if (flags & MPOL_F_RELATIVE_NODES)
223			mpol_relative_nodemask(&cpuset_context_nmask, nodes,
224					       &cpuset_current_mems_allowed);
225		else
226			nodes_and(cpuset_context_nmask, *nodes,
227				  cpuset_current_mems_allowed);
228		if (mpol_store_user_nodemask(policy))
229			policy->w.user_nodemask = *nodes;
230		else
231			policy->w.cpuset_mems_allowed =
232						cpuset_mems_allowed(current);
233	}
234
235	ret = mpol_ops[mode].create(policy,
236				nodes ? &cpuset_context_nmask : NULL);
237	if (ret < 0) {
238		kmem_cache_free(policy_cache, policy);
239		return ERR_PTR(ret);
240	}
241	return policy;
242}
243
244/* Slow path of a mpol destructor. */
245void __mpol_put(struct mempolicy *p)
246{
247	if (!atomic_dec_and_test(&p->refcnt))
248		return;
249	p->mode = MPOL_DEFAULT;
250	kmem_cache_free(policy_cache, p);
251}
252
253static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
254{
255}
256
257static void mpol_rebind_nodemask(struct mempolicy *pol,
258				 const nodemask_t *nodes)
259{
260	nodemask_t tmp;
261
262	if (pol->flags & MPOL_F_STATIC_NODES)
263		nodes_and(tmp, pol->w.user_nodemask, *nodes);
264	else if (pol->flags & MPOL_F_RELATIVE_NODES)
265		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
266	else {
267		nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed,
268			    *nodes);
269		pol->w.cpuset_mems_allowed = *nodes;
270	}
271
272	pol->v.nodes = tmp;
273	if (!node_isset(current->il_next, tmp)) {
274		current->il_next = next_node(current->il_next, tmp);
275		if (current->il_next >= MAX_NUMNODES)
276			current->il_next = first_node(tmp);
277		if (current->il_next >= MAX_NUMNODES)
278			current->il_next = numa_node_id();
279	}
280}
281
282static void mpol_rebind_preferred(struct mempolicy *pol,
283				  const nodemask_t *nodes)
284{
285	nodemask_t tmp;
286
287	if (pol->flags & MPOL_F_STATIC_NODES) {
288		int node = first_node(pol->w.user_nodemask);
289
290		if (node_isset(node, *nodes))
291			pol->v.preferred_node = node;
292		else
293			pol->v.preferred_node = -1;
294	} else if (pol->flags & MPOL_F_RELATIVE_NODES) {
295		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
296		pol->v.preferred_node = first_node(tmp);
297	} else if (pol->v.preferred_node != -1) {
298		pol->v.preferred_node = node_remap(pol->v.preferred_node,
299						   pol->w.cpuset_mems_allowed,
300						   *nodes);
301		pol->w.cpuset_mems_allowed = *nodes;
302	}
303}
304
305/* Migrate a policy to a different set of nodes */
306static void mpol_rebind_policy(struct mempolicy *pol,
307			       const nodemask_t *newmask)
308{
309	if (!pol)
310		return;
311	if (!mpol_store_user_nodemask(pol) &&
312	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
313		return;
314	mpol_ops[pol->mode].rebind(pol, newmask);
315}
316
317/*
318 * Wrapper for mpol_rebind_policy() that just requires task
319 * pointer, and updates task mempolicy.
320 */
321
322void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
323{
324	mpol_rebind_policy(tsk->mempolicy, new);
325}
326
327/*
328 * Rebind each vma in mm to new nodemask.
329 *
330 * Call holding a reference to mm.  Takes mm->mmap_sem during call.
331 */
332
333void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
334{
335	struct vm_area_struct *vma;
336
337	down_write(&mm->mmap_sem);
338	for (vma = mm->mmap; vma; vma = vma->vm_next)
339		mpol_rebind_policy(vma->vm_policy, new);
340	up_write(&mm->mmap_sem);
341}
342
343static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
344	[MPOL_DEFAULT] = {
345		.rebind = mpol_rebind_default,
346	},
347	[MPOL_INTERLEAVE] = {
348		.create = mpol_new_interleave,
349		.rebind = mpol_rebind_nodemask,
350	},
351	[MPOL_PREFERRED] = {
352		.create = mpol_new_preferred,
353		.rebind = mpol_rebind_preferred,
354	},
355	[MPOL_BIND] = {
356		.create = mpol_new_bind,
357		.rebind = mpol_rebind_nodemask,
358	},
359};
360
361static void gather_stats(struct page *, void *, int pte_dirty);
362static void migrate_page_add(struct page *page, struct list_head *pagelist,
363				unsigned long flags);
364
365/* Scan through pages checking if pages follow certain conditions. */
366static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
367		unsigned long addr, unsigned long end,
368		const nodemask_t *nodes, unsigned long flags,
369		void *private)
370{
371	pte_t *orig_pte;
372	pte_t *pte;
373	spinlock_t *ptl;
374
375	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
376	do {
377		struct page *page;
378		int nid;
379
380		if (!pte_present(*pte))
381			continue;
382		page = vm_normal_page(vma, addr, *pte);
383		if (!page)
384			continue;
385		/*
386		 * The check for PageReserved here is important to avoid
387		 * handling zero pages and other pages that may have been
388		 * marked special by the system.
389		 *
390		 * If PageReserved were not checked here then e.g.
391		 * the location of the zero page could have an influence
392		 * on MPOL_MF_STRICT, zero pages would be counted for
393		 * the per node stats, and there would be useless attempts
394		 * to put zero pages on the migration list.
395		 */
396		if (PageReserved(page))
397			continue;
398		nid = page_to_nid(page);
399		if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
400			continue;
401
402		if (flags & MPOL_MF_STATS)
403			gather_stats(page, private, pte_dirty(*pte));
404		else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
405			migrate_page_add(page, private, flags);
406		else
407			break;
408	} while (pte++, addr += PAGE_SIZE, addr != end);
409	pte_unmap_unlock(orig_pte, ptl);
410	return addr != end;
411}
412
413static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
414		unsigned long addr, unsigned long end,
415		const nodemask_t *nodes, unsigned long flags,
416		void *private)
417{
418	pmd_t *pmd;
419	unsigned long next;
420
421	pmd = pmd_offset(pud, addr);
422	do {
423		next = pmd_addr_end(addr, end);
424		if (pmd_none_or_clear_bad(pmd))
425			continue;
426		if (check_pte_range(vma, pmd, addr, next, nodes,
427				    flags, private))
428			return -EIO;
429	} while (pmd++, addr = next, addr != end);
430	return 0;
431}
432
433static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
434		unsigned long addr, unsigned long end,
435		const nodemask_t *nodes, unsigned long flags,
436		void *private)
437{
438	pud_t *pud;
439	unsigned long next;
440
441	pud = pud_offset(pgd, addr);
442	do {
443		next = pud_addr_end(addr, end);
444		if (pud_none_or_clear_bad(pud))
445			continue;
446		if (check_pmd_range(vma, pud, addr, next, nodes,
447				    flags, private))
448			return -EIO;
449	} while (pud++, addr = next, addr != end);
450	return 0;
451}
452
453static inline int check_pgd_range(struct vm_area_struct *vma,
454		unsigned long addr, unsigned long end,
455		const nodemask_t *nodes, unsigned long flags,
456		void *private)
457{
458	pgd_t *pgd;
459	unsigned long next;
460
461	pgd = pgd_offset(vma->vm_mm, addr);
462	do {
463		next = pgd_addr_end(addr, end);
464		if (pgd_none_or_clear_bad(pgd))
465			continue;
466		if (check_pud_range(vma, pgd, addr, next, nodes,
467				    flags, private))
468			return -EIO;
469	} while (pgd++, addr = next, addr != end);
470	return 0;
471}
472
473/*
474 * Check if all pages in a range are on a set of nodes.
475 * If pagelist != NULL then isolate pages from the LRU and
476 * put them on the pagelist.
477 */
478static struct vm_area_struct *
479check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
480		const nodemask_t *nodes, unsigned long flags, void *private)
481{
482	int err;
483	struct vm_area_struct *first, *vma, *prev;
484
485	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
486
487		err = migrate_prep();
488		if (err)
489			return ERR_PTR(err);
490	}
491
492	first = find_vma(mm, start);
493	if (!first)
494		return ERR_PTR(-EFAULT);
495	prev = NULL;
496	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
497		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
498			if (!vma->vm_next && vma->vm_end < end)
499				return ERR_PTR(-EFAULT);
500			if (prev && prev->vm_end < vma->vm_start)
501				return ERR_PTR(-EFAULT);
502		}
503		if (!is_vm_hugetlb_page(vma) &&
504		    ((flags & MPOL_MF_STRICT) ||
505		     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
506				vma_migratable(vma)))) {
507			unsigned long endvma = vma->vm_end;
508
509			if (endvma > end)
510				endvma = end;
511			if (vma->vm_start > start)
512				start = vma->vm_start;
513			err = check_pgd_range(vma, start, endvma, nodes,
514						flags, private);
515			if (err) {
516				first = ERR_PTR(err);
517				break;
518			}
519		}
520		prev = vma;
521	}
522	return first;
523}
524
525/* Apply policy to a single VMA */
526static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
527{
528	int err = 0;
529	struct mempolicy *old = vma->vm_policy;
530
531	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
532		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
533		 vma->vm_ops, vma->vm_file,
534		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
535
536	if (vma->vm_ops && vma->vm_ops->set_policy)
537		err = vma->vm_ops->set_policy(vma, new);
538	if (!err) {
539		mpol_get(new);
540		vma->vm_policy = new;
541		mpol_put(old);
542	}
543	return err;
544}
545
546/* Step 2: apply policy to a range and do splits. */
547static int mbind_range(struct vm_area_struct *vma, unsigned long start,
548		       unsigned long end, struct mempolicy *new)
549{
550	struct vm_area_struct *next;
551	int err;
552
553	err = 0;
554	for (; vma && vma->vm_start < end; vma = next) {
555		next = vma->vm_next;
556		if (vma->vm_start < start)
557			err = split_vma(vma->vm_mm, vma, start, 1);
558		if (!err && vma->vm_end > end)
559			err = split_vma(vma->vm_mm, vma, end, 0);
560		if (!err)
561			err = policy_vma(vma, new);
562		if (err)
563			break;
564	}
565	return err;
566}
567
568/*
569 * Update task->flags PF_MEMPOLICY bit: set iff non-default
570 * mempolicy.  Allows more rapid checking of this (combined perhaps
571 * with other PF_* flag bits) on memory allocation hot code paths.
572 *
573 * If called from outside this file, the task 'p' should -only- be
574 * a newly forked child not yet visible on the task list, because
575 * manipulating the task flags of a visible task is not safe.
576 *
577 * The above limitation is why this routine has the funny name
578 * mpol_fix_fork_child_flag().
579 *
580 * It is also safe to call this with a task pointer of current,
581 * which the static wrapper mpol_set_task_struct_flag() does,
582 * for use within this file.
583 */
584
585void mpol_fix_fork_child_flag(struct task_struct *p)
586{
587	if (p->mempolicy)
588		p->flags |= PF_MEMPOLICY;
589	else
590		p->flags &= ~PF_MEMPOLICY;
591}
592
593static void mpol_set_task_struct_flag(void)
594{
595	mpol_fix_fork_child_flag(current);
596}
597
598/* Set the process memory policy */
599static long do_set_mempolicy(unsigned short mode, unsigned short flags,
600			     nodemask_t *nodes)
601{
602	struct mempolicy *new;
603	struct mm_struct *mm = current->mm;
604
605	new = mpol_new(mode, flags, nodes);
606	if (IS_ERR(new))
607		return PTR_ERR(new);
608
609	/*
610	 * prevent changing our mempolicy while show_numa_maps()
611	 * is using it.
612	 * Note:  do_set_mempolicy() can be called at init time
613	 * with no 'mm'.
614	 */
615	if (mm)
616		down_write(&mm->mmap_sem);
617	mpol_put(current->mempolicy);
618	current->mempolicy = new;
619	mpol_set_task_struct_flag();
620	if (new && new->mode == MPOL_INTERLEAVE &&
621	    nodes_weight(new->v.nodes))
622		current->il_next = first_node(new->v.nodes);
623	if (mm)
624		up_write(&mm->mmap_sem);
625
626	return 0;
627}
628
629/* Fill a zone bitmap for a policy */
630static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
631{
632	nodes_clear(*nodes);
633	switch (p->mode) {
634	case MPOL_DEFAULT:
635		break;
636	case MPOL_BIND:
637		/* Fall through */
638	case MPOL_INTERLEAVE:
639		*nodes = p->v.nodes;
640		break;
641	case MPOL_PREFERRED:
642		/* or use current node instead of memory_map? */
643		if (p->v.preferred_node < 0)
644			*nodes = node_states[N_HIGH_MEMORY];
645		else
646			node_set(p->v.preferred_node, *nodes);
647		break;
648	default:
649		BUG();
650	}
651}
652
653static int lookup_node(struct mm_struct *mm, unsigned long addr)
654{
655	struct page *p;
656	int err;
657
658	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
659	if (err >= 0) {
660		err = page_to_nid(p);
661		put_page(p);
662	}
663	return err;
664}
665
666/* Retrieve NUMA policy */
667static long do_get_mempolicy(int *policy, nodemask_t *nmask,
668			     unsigned long addr, unsigned long flags)
669{
670	int err;
671	struct mm_struct *mm = current->mm;
672	struct vm_area_struct *vma = NULL;
673	struct mempolicy *pol = current->mempolicy;
674
675	cpuset_update_task_memory_state();
676	if (flags &
677		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
678		return -EINVAL;
679
680	if (flags & MPOL_F_MEMS_ALLOWED) {
681		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
682			return -EINVAL;
683		*policy = 0;	/* just so it's initialized */
684		*nmask  = cpuset_current_mems_allowed;
685		return 0;
686	}
687
688	if (flags & MPOL_F_ADDR) {
689		down_read(&mm->mmap_sem);
690		vma = find_vma_intersection(mm, addr, addr+1);
691		if (!vma) {
692			up_read(&mm->mmap_sem);
693			return -EFAULT;
694		}
695		if (vma->vm_ops && vma->vm_ops->get_policy)
696			pol = vma->vm_ops->get_policy(vma, addr);
697		else
698			pol = vma->vm_policy;
699	} else if (addr)
700		return -EINVAL;
701
702	if (!pol)
703		pol = &default_policy;
704
705	if (flags & MPOL_F_NODE) {
706		if (flags & MPOL_F_ADDR) {
707			err = lookup_node(mm, addr);
708			if (err < 0)
709				goto out;
710			*policy = err;
711		} else if (pol == current->mempolicy &&
712				pol->mode == MPOL_INTERLEAVE) {
713			*policy = current->il_next;
714		} else {
715			err = -EINVAL;
716			goto out;
717		}
718	} else
719		*policy = pol->mode | pol->flags;
720
721	if (vma) {
722		up_read(&current->mm->mmap_sem);
723		vma = NULL;
724	}
725
726	err = 0;
727	if (nmask)
728		get_zonemask(pol, nmask);
729
730 out:
731	mpol_cond_put(pol);
732	if (vma)
733		up_read(&current->mm->mmap_sem);
734	return err;
735}
736
737#ifdef CONFIG_MIGRATION
738/*
739 * page migration
740 */
741static void migrate_page_add(struct page *page, struct list_head *pagelist,
742				unsigned long flags)
743{
744	/*
745	 * Avoid migrating a page that is shared with others.
746	 */
747	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
748		isolate_lru_page(page, pagelist);
749}
750
751static struct page *new_node_page(struct page *page, unsigned long node, int **x)
752{
753	return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0);
754}
755
756/*
757 * Migrate pages from one node to a target node.
758 * Returns error or the number of pages not migrated.
759 */
760static int migrate_to_node(struct mm_struct *mm, int source, int dest,
761			   int flags)
762{
763	nodemask_t nmask;
764	LIST_HEAD(pagelist);
765	int err = 0;
766
767	nodes_clear(nmask);
768	node_set(source, nmask);
769
770	check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
771			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
772
773	if (!list_empty(&pagelist))
774		err = migrate_pages(&pagelist, new_node_page, dest);
775
776	return err;
777}
778
779/*
780 * Move pages between the two nodesets so as to preserve the physical
781 * layout as much as possible.
782 *
783 * Returns the number of pages that could not be moved.
784 */
785int do_migrate_pages(struct mm_struct *mm,
786	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
787{
788	LIST_HEAD(pagelist);
789	int busy = 0;
790	int err = 0;
791	nodemask_t tmp;
792
793	down_read(&mm->mmap_sem);
794
795	err = migrate_vmas(mm, from_nodes, to_nodes, flags);
796	if (err)
797		goto out;
798
799/*
800 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
801 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
802 * bit in 'tmp', and return that <source, dest> pair for migration.
803 * The pair of nodemasks 'to' and 'from' define the map.
804 *
805 * If no pair of bits is found that way, fall back to picking some
806 * pair of 'source' and 'dest' bits that are not the same.  If the
807 * 'source' and 'dest' bits are the same, this represents a node
808 * that will be migrating to itself, so no pages need move.
809 *
810 * If no bits are left in 'tmp', or if all remaining bits left
811 * in 'tmp' correspond to the same bit in 'to', return false
812 * (nothing left to migrate).
813 *
814 * This lets us pick a pair of nodes to migrate between, such that
815 * if possible the dest node is not already occupied by some other
816 * source node, minimizing the risk of overloading the memory on a
817 * node that would happen if we migrated incoming memory to a node
818 * before migrating the outgoing memory off that same node.
819 *
820 * A single scan of tmp is sufficient.  As we go, we remember the
821 * most recent <s, d> pair that moved (s != d).  If we find a pair
822 * that not only moved, but what's better, moved to an empty slot
823 * (d is not set in tmp), then we break out immediately with that pair.
824 * Otherwise, when we finish scanning tmp, we at least have the
825 * most recent <s, d> pair that moved.  If we get all the way through
826 * the scan of tmp without finding any node that moved, much less
827 * moved to an empty node, then there is nothing left worth migrating.
828 */
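
/*
 * Worked example (a sketch): from_nodes = {0,1}, to_nodes = {1,2}, so
 * node_remap() sends 0 -> 1 and 1 -> 2.  Scanning tmp = {0,1}: <0,1>
 * moves, but dest 1 is still a remaining source, so keep looking;
 * <1,2> moves to an empty slot, so migrate 1 -> 2 first and clear bit 1.
 * On the next pass tmp = {0}, and <0,1> is migrated only after node 1
 * has already been drained.
 */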
829
830	tmp = *from_nodes;
831	while (!nodes_empty(tmp)) {
832		int s,d;
833		int source = -1;
834		int dest = 0;
835
836		for_each_node_mask(s, tmp) {
837			d = node_remap(s, *from_nodes, *to_nodes);
838			if (s == d)
839				continue;
840
841			source = s;	/* Node moved. Memorize */
842			dest = d;
843
844			/* dest not in remaining from nodes? */
845			if (!node_isset(dest, tmp))
846				break;
847		}
848		if (source == -1)
849			break;
850
851		node_clear(source, tmp);
852		err = migrate_to_node(mm, source, dest, flags);
853		if (err > 0)
854			busy += err;
855		if (err < 0)
856			break;
857	}
858out:
859	up_read(&mm->mmap_sem);
860	if (err < 0)
861		return err;
862	return busy;
863
864}
865
866/*
867 * Allocate a new page for page migration based on vma policy.
868 * Start assuming that page is mapped by vma pointed to by @private.
869 * Search forward from there, if not.  N.B., this assumes that the
870 * list of pages handed to migrate_pages()--which is how we get here--
871 * is in virtual address order.
872 */
873static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
874{
875	struct vm_area_struct *vma = (struct vm_area_struct *)private;
876	unsigned long uninitialized_var(address);
877
878	while (vma) {
879		address = page_address_in_vma(page, vma);
880		if (address != -EFAULT)
881			break;
882		vma = vma->vm_next;
883	}
884
885	/*
886	 * if !vma, alloc_page_vma() will use task or system default policy
887	 */
888	return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
889}
890#else
891
892static void migrate_page_add(struct page *page, struct list_head *pagelist,
893				unsigned long flags)
894{
895}
896
897int do_migrate_pages(struct mm_struct *mm,
898	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
899{
900	return -ENOSYS;
901}
902
903static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
904{
905	return NULL;
906}
907#endif
908
909static long do_mbind(unsigned long start, unsigned long len,
910		     unsigned short mode, unsigned short mode_flags,
911		     nodemask_t *nmask, unsigned long flags)
912{
913	struct vm_area_struct *vma;
914	struct mm_struct *mm = current->mm;
915	struct mempolicy *new;
916	unsigned long end;
917	int err;
918	LIST_HEAD(pagelist);
919
920	if (flags & ~(unsigned long)(MPOL_MF_STRICT |
921				     MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
922		return -EINVAL;
923	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
924		return -EPERM;
925
926	if (start & ~PAGE_MASK)
927		return -EINVAL;
928
929	if (mode == MPOL_DEFAULT)
930		flags &= ~MPOL_MF_STRICT;
931
932	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
933	end = start + len;
934
935	if (end < start)
936		return -EINVAL;
937	if (end == start)
938		return 0;
939
940	new = mpol_new(mode, mode_flags, nmask);
941	if (IS_ERR(new))
942		return PTR_ERR(new);
943
944	/*
945	 * If we are using the default policy then operation
946	 * on discontinuous address spaces is okay after all
947	 */
948	if (!new)
949		flags |= MPOL_MF_DISCONTIG_OK;
950
951	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
952		 start, start + len, mode, mode_flags,
953		 nmask ? nodes_addr(*nmask)[0] : -1);
954
955	down_write(&mm->mmap_sem);
956	vma = check_range(mm, start, end, nmask,
957			  flags | MPOL_MF_INVERT, &pagelist);
958
959	err = PTR_ERR(vma);
960	if (!IS_ERR(vma)) {
961		int nr_failed = 0;
962
963		err = mbind_range(vma, start, end, new);
964
965		if (!list_empty(&pagelist))
966			nr_failed = migrate_pages(&pagelist, new_vma_page,
967						(unsigned long)vma);
968
969		if (!err && nr_failed && (flags & MPOL_MF_STRICT))
970			err = -EIO;
971	}
972
973	up_write(&mm->mmap_sem);
974	mpol_put(new);
975	return err;
976}
977
978/*
979 * User space interface with variable sized bitmaps for nodelists.
980 */
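
/*
 * Note (a sketch of the calling convention): userspace may OR the mode
 * flags into the mode argument, e.g.
 *
 *	set_mempolicy(MPOL_INTERLEAVE | MPOL_F_RELATIVE_NODES, &mask, maxnode);
 *
 * The syscall entry points below split those MPOL_MODE_FLAGS back out
 * before calling do_set_mempolicy()/do_mbind().
 */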
981
982/* Copy a node mask from user space. */
983static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
984		     unsigned long maxnode)
985{
986	unsigned long k;
987	unsigned long nlongs;
988	unsigned long endmask;
989
990	--maxnode;
991	nodes_clear(*nodes);
992	if (maxnode == 0 || !nmask)
993		return 0;
994	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
995		return -EINVAL;
996
997	nlongs = BITS_TO_LONGS(maxnode);
998	if ((maxnode % BITS_PER_LONG) == 0)
999		endmask = ~0UL;
1000	else
1001		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1002
1003	/* When the user specified more nodes than supported, just check
1004	   that the unsupported part is all zero. */
1005	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1006		if (nlongs > PAGE_SIZE/sizeof(long))
1007			return -EINVAL;
1008		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1009			unsigned long t;
1010			if (get_user(t, nmask + k))
1011				return -EFAULT;
1012			if (k == nlongs - 1) {
1013				if (t & endmask)
1014					return -EINVAL;
1015			} else if (t)
1016				return -EINVAL;
1017		}
1018		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1019		endmask = ~0UL;
1020	}
1021
1022	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1023		return -EFAULT;
1024	nodes_addr(*nodes)[nlongs-1] &= endmask;
1025	return 0;
1026}
1027
1028/* Copy a kernel node mask to user space */
1029static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1030			      nodemask_t *nodes)
1031{
1032	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1033	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1034
1035	if (copy > nbytes) {
1036		if (copy > PAGE_SIZE)
1037			return -EINVAL;
1038		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1039			return -EFAULT;
1040		copy = nbytes;
1041	}
1042	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1043}
1044
1045asmlinkage long sys_mbind(unsigned long start, unsigned long len,
1046			unsigned long mode,
1047			unsigned long __user *nmask, unsigned long maxnode,
1048			unsigned flags)
1049{
1050	nodemask_t nodes;
1051	int err;
1052	unsigned short mode_flags;
1053
1054	mode_flags = mode & MPOL_MODE_FLAGS;
1055	mode &= ~MPOL_MODE_FLAGS;
1056	if (mode >= MPOL_MAX)
1057		return -EINVAL;
1058	if ((mode_flags & MPOL_F_STATIC_NODES) &&
1059	    (mode_flags & MPOL_F_RELATIVE_NODES))
1060		return -EINVAL;
1061	err = get_nodes(&nodes, nmask, maxnode);
1062	if (err)
1063		return err;
1064	return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1065}
1066
1067/* Set the process memory policy */
1068asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
1069		unsigned long maxnode)
1070{
1071	int err;
1072	nodemask_t nodes;
1073	unsigned short flags;
1074
1075	flags = mode & MPOL_MODE_FLAGS;
1076	mode &= ~MPOL_MODE_FLAGS;
1077	if ((unsigned int)mode >= MPOL_MAX)
1078		return -EINVAL;
1079	if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1080		return -EINVAL;
1081	err = get_nodes(&nodes, nmask, maxnode);
1082	if (err)
1083		return err;
1084	return do_set_mempolicy(mode, flags, &nodes);
1085}
1086
1087asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
1088		const unsigned long __user *old_nodes,
1089		const unsigned long __user *new_nodes)
1090{
1091	struct mm_struct *mm;
1092	struct task_struct *task;
1093	nodemask_t old;
1094	nodemask_t new;
1095	nodemask_t task_nodes;
1096	int err;
1097
1098	err = get_nodes(&old, old_nodes, maxnode);
1099	if (err)
1100		return err;
1101
1102	err = get_nodes(&new, new_nodes, maxnode);
1103	if (err)
1104		return err;
1105
1106	/* Find the mm_struct */
1107	read_lock(&tasklist_lock);
1108	task = pid ? find_task_by_vpid(pid) : current;
1109	if (!task) {
1110		read_unlock(&tasklist_lock);
1111		return -ESRCH;
1112	}
1113	mm = get_task_mm(task);
1114	read_unlock(&tasklist_lock);
1115
1116	if (!mm)
1117		return -EINVAL;
1118
1119	/*
1120	 * Check if this process has the right to modify the specified
1121	 * process. The right exists if the process has administrative
1122	 * capabilities, superuser privileges or the same
1123	 * userid as the target process.
1124	 */
1125	if ((current->euid != task->suid) && (current->euid != task->uid) &&
1126	    (current->uid != task->suid) && (current->uid != task->uid) &&
1127	    !capable(CAP_SYS_NICE)) {
1128		err = -EPERM;
1129		goto out;
1130	}
1131
1132	task_nodes = cpuset_mems_allowed(task);
1133	/* Is the user allowed to access the target nodes? */
1134	if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
1135		err = -EPERM;
1136		goto out;
1137	}
1138
1139	if (!nodes_subset(new, node_states[N_HIGH_MEMORY])) {
1140		err = -EINVAL;
1141		goto out;
1142	}
1143
1144	err = security_task_movememory(task);
1145	if (err)
1146		goto out;
1147
1148	err = do_migrate_pages(mm, &old, &new,
1149		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1150out:
1151	mmput(mm);
1152	return err;
1153}
1154
1155
1156/* Retrieve NUMA policy */
1157asmlinkage long sys_get_mempolicy(int __user *policy,
1158				unsigned long __user *nmask,
1159				unsigned long maxnode,
1160				unsigned long addr, unsigned long flags)
1161{
1162	int err;
1163	int uninitialized_var(pval);
1164	nodemask_t nodes;
1165
1166	if (nmask != NULL && maxnode < MAX_NUMNODES)
1167		return -EINVAL;
1168
1169	err = do_get_mempolicy(&pval, &nodes, addr, flags);
1170
1171	if (err)
1172		return err;
1173
1174	if (policy && put_user(pval, policy))
1175		return -EFAULT;
1176
1177	if (nmask)
1178		err = copy_nodes_to_user(nmask, maxnode, &nodes);
1179
1180	return err;
1181}
1182
1183#ifdef CONFIG_COMPAT
1184
1185asmlinkage long compat_sys_get_mempolicy(int __user *policy,
1186				     compat_ulong_t __user *nmask,
1187				     compat_ulong_t maxnode,
1188				     compat_ulong_t addr, compat_ulong_t flags)
1189{
1190	long err;
1191	unsigned long __user *nm = NULL;
1192	unsigned long nr_bits, alloc_size;
1193	DECLARE_BITMAP(bm, MAX_NUMNODES);
1194
1195	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1196	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1197
1198	if (nmask)
1199		nm = compat_alloc_user_space(alloc_size);
1200
1201	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1202
1203	if (!err && nmask) {
1204		err = copy_from_user(bm, nm, alloc_size);
1205		/* ensure entire bitmap is zeroed */
1206		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1207		err |= compat_put_bitmap(nmask, bm, nr_bits);
1208	}
1209
1210	return err;
1211}
1212
1213asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1214				     compat_ulong_t maxnode)
1215{
1216	long err = 0;
1217	unsigned long __user *nm = NULL;
1218	unsigned long nr_bits, alloc_size;
1219	DECLARE_BITMAP(bm, MAX_NUMNODES);
1220
1221	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1222	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1223
1224	if (nmask) {
1225		err = compat_get_bitmap(bm, nmask, nr_bits);
1226		nm = compat_alloc_user_space(alloc_size);
1227		err |= copy_to_user(nm, bm, alloc_size);
1228	}
1229
1230	if (err)
1231		return -EFAULT;
1232
1233	return sys_set_mempolicy(mode, nm, nr_bits+1);
1234}
1235
1236asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1237			     compat_ulong_t mode, compat_ulong_t __user *nmask,
1238			     compat_ulong_t maxnode, compat_ulong_t flags)
1239{
1240	long err = 0;
1241	unsigned long __user *nm = NULL;
1242	unsigned long nr_bits, alloc_size;
1243	nodemask_t bm;
1244
1245	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1246	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1247
1248	if (nmask) {
1249		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1250		nm = compat_alloc_user_space(alloc_size);
1251		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1252	}
1253
1254	if (err)
1255		return -EFAULT;
1256
1257	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1258}
1259
1260#endif
1261
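/*
 * Typical caller pattern for the lookup below (a sketch of what
 * alloc_page_vma() does later in this file):
 *
 *	pol = get_vma_policy(current, vma, addr);
 *	... derive a zonelist/nodemask from pol ...
 *	mpol_cond_put(pol);		only drops a ref for MPOL_F_SHARED
 */
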
1262/*
1263 * get_vma_policy(@task, @vma, @addr)
1264 * @task - task for fallback if vma policy == default
1265 * @vma   - virtual memory area whose policy is sought
1266 * @addr  - address in @vma for shared policy lookup
1267 *
1268 * Returns effective policy for a VMA at specified address.
1269 * Falls back to @task or system default policy, as necessary.
1270 * Current or other task's task mempolicy and non-shared vma policies
1271 * are protected by the task's mmap_sem, which must be held for read by
1272 * the caller.
1273 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1274 * count--added by the get_policy() vm_op, as appropriate--to protect against
1275 * freeing by another task.  It is the caller's responsibility to free the
1276 * extra reference for shared policies.
1277 */
1278static struct mempolicy *get_vma_policy(struct task_struct *task,
1279		struct vm_area_struct *vma, unsigned long addr)
1280{
1281	struct mempolicy *pol = task->mempolicy;
1282
1283	if (vma) {
1284		if (vma->vm_ops && vma->vm_ops->get_policy) {
1285			struct mempolicy *vpol = vma->vm_ops->get_policy(vma,
1286									addr);
1287			if (vpol)
1288				pol = vpol;
1289		} else if (vma->vm_policy &&
1290				vma->vm_policy->mode != MPOL_DEFAULT)
1291			pol = vma->vm_policy;
1292	}
1293	if (!pol)
1294		pol = &default_policy;
1295	return pol;
1296}
1297
1298/*
1299 * Return a nodemask representing a mempolicy for filtering nodes for
1300 * page allocation
1301 */
1302static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1303{
1304	/* Lower zones don't get a nodemask applied for MPOL_BIND */
1305	if (unlikely(policy->mode == MPOL_BIND) &&
1306			gfp_zone(gfp) >= policy_zone &&
1307			cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1308		return &policy->v.nodes;
1309
1310	return NULL;
1311}
1312
1313/* Return a zonelist indicated by gfp for node representing a mempolicy */
1314static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy)
1315{
1316	int nd;
1317
1318	switch (policy->mode) {
1319	case MPOL_PREFERRED:
1320		nd = policy->v.preferred_node;
1321		if (nd < 0)
1322			nd = numa_node_id();
1323		break;
1324	case MPOL_BIND:
1325		/*
1326		 * Normally, MPOL_BIND allocations are node-local within the
1327		 * allowed nodemask.  However, if __GFP_THISNODE is set and the
1328		 * current node is part of the mask, we use the zonelist for
1329		 * the first node in the mask instead.
1330		 */
1331		nd = numa_node_id();
1332		if (unlikely(gfp & __GFP_THISNODE) &&
1333				unlikely(!node_isset(nd, policy->v.nodes)))
1334			nd = first_node(policy->v.nodes);
1335		break;
1336	case MPOL_INTERLEAVE: /* should not happen */
1337	case MPOL_DEFAULT:
1338		nd = numa_node_id();
1339		break;
1340	default:
1341		nd = 0;
1342		BUG();
1343	}
1344	return node_zonelist(nd, gfp);
1345}
1346
1347/* Do dynamic interleaving for a process */
1348static unsigned interleave_nodes(struct mempolicy *policy)
1349{
1350	unsigned nid, next;
1351	struct task_struct *me = current;
1352
1353	nid = me->il_next;
1354	next = next_node(nid, policy->v.nodes);
1355	if (next >= MAX_NUMNODES)
1356		next = first_node(policy->v.nodes);
1357	if (next < MAX_NUMNODES)
1358		me->il_next = next;
1359	return nid;
1360}
1361
1362/*
1363 * Depending on the memory policy provide a node from which to allocate the
1364 * next slab entry.
1365 * @policy must be protected from freeing by the caller.  If @policy is
1366 * the current task's mempolicy, this protection is implicit, as only the
1367 * task can change its policy.  The system default policy requires no
1368 * such protection.
1369 */
1370unsigned slab_node(struct mempolicy *policy)
1371{
1372	unsigned short pol = policy ? policy->mode : MPOL_DEFAULT;
1373
1374	switch (pol) {
1375	case MPOL_INTERLEAVE:
1376		return interleave_nodes(policy);
1377
1378	case MPOL_BIND: {
1379		/*
1380		 * Follow bind policy behavior and start allocation at the
1381		 * first node.
1382		 */
1383		struct zonelist *zonelist;
1384		struct zone *zone;
1385		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1386		zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
1387		(void)first_zones_zonelist(zonelist, highest_zoneidx,
1388							&policy->v.nodes,
1389							&zone);
1390		return zone->node;
1391	}
1392
1393	case MPOL_PREFERRED:
1394		if (policy->v.preferred_node >= 0)
1395			return policy->v.preferred_node;
1396		/* Fall through */
1397
1398	default:
1399		return numa_node_id();
1400	}
1401}
1402
1403/* Do static interleaving for a VMA with known offset. */
1404static unsigned offset_il_node(struct mempolicy *pol,
1405		struct vm_area_struct *vma, unsigned long off)
1406{
1407	unsigned nnodes = nodes_weight(pol->v.nodes);
1408	unsigned target;
1409	int c;
1410	int nid = -1;
1411
1412	if (!nnodes)
1413		return numa_node_id();
1414	target = (unsigned int)off % nnodes;
1415	c = 0;
1416	do {
1417		nid = next_node(nid, pol->v.nodes);
1418		c++;
1419	} while (c <= target);
1420	return nid;
1421}
1422
1423/* Determine a node number for interleave */
1424static inline unsigned interleave_nid(struct mempolicy *pol,
1425		 struct vm_area_struct *vma, unsigned long addr, int shift)
1426{
1427	if (vma) {
1428		unsigned long off;
1429
1430		/*
1431		 * for small pages, there is no difference between
1432		 * shift and PAGE_SHIFT, so the bit-shift is safe.
1433		 * for huge pages, since vm_pgoff is in units of small
1434		 * pages, we need to shift off the always 0 bits to get
1435		 * a useful offset.
1436		 */
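		/*
		 * Example (a sketch): with PAGE_SHIFT == 12 and 2MB huge
		 * pages (shift == 21), off becomes (vm_pgoff >> 9) plus
		 * ((addr - vm_start) >> 21), i.e. the huge-page-sized
		 * offset into the backing object.
		 */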
1437		BUG_ON(shift < PAGE_SHIFT);
1438		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1439		off += (addr - vma->vm_start) >> shift;
1440		return offset_il_node(pol, vma, off);
1441	} else
1442		return interleave_nodes(pol);
1443}
1444
1445#ifdef CONFIG_HUGETLBFS
1446/*
1447 * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1448 * @vma = virtual memory area whose policy is sought
1449 * @addr = address in @vma for shared policy lookup and interleave policy
1450 * @gfp_flags = for requested zone
1451 * @mpol = pointer to mempolicy pointer for reference counted mempolicy
1452 * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
1453 *
1454 * Returns a zonelist suitable for a huge page allocation and a pointer
1455 * to the struct mempolicy for conditional unref after allocation.
1456 * If the effective policy is MPOL_BIND, returns a pointer to the mempolicy's
1457 * @nodemask for filtering the zonelist.
1458 */
1459struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1460				gfp_t gfp_flags, struct mempolicy **mpol,
1461				nodemask_t **nodemask)
1462{
1463	struct zonelist *zl;
1464
1465	*mpol = get_vma_policy(current, vma, addr);
1466	*nodemask = NULL;	/* assume !MPOL_BIND */
1467
1468	if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1469		zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1470						HPAGE_SHIFT), gfp_flags);
1471	} else {
1472		zl = policy_zonelist(gfp_flags, *mpol);
1473		if ((*mpol)->mode == MPOL_BIND)
1474			*nodemask = &(*mpol)->v.nodes;
1475	}
1476	return zl;
1477}
1478#endif
1479
1480/* Allocate a page in interleaved policy.
1481   Own path because it needs to do special accounting. */
1482static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1483					unsigned nid)
1484{
1485	struct zonelist *zl;
1486	struct page *page;
1487
1488	zl = node_zonelist(nid, gfp);
1489	page = __alloc_pages(gfp, order, zl);
1490	if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1491		inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1492	return page;
1493}
1494
1495/**
1496 * 	alloc_page_vma	- Allocate a page for a VMA.
1497 *
1498 * 	@gfp:
1499 *      %GFP_USER    user allocation.
1500 *      %GFP_KERNEL  kernel allocations,
1501 *      %GFP_HIGHMEM highmem/user allocations,
1502 *      %GFP_FS      allocation should not call back into a file system.
1503 *      %GFP_ATOMIC  don't sleep.
1504 *
1505 * 	@vma:  Pointer to VMA or NULL if not available.
1506 *	@addr: Virtual Address of the allocation. Must be inside the VMA.
1507 *
1508 * 	This function allocates a page from the kernel page pool and applies
1509 *	a NUMA policy associated with the VMA or the current process.
1510 *	When VMA is not NULL caller must hold down_read on the mmap_sem of the
1511 *	mm_struct of the VMA to prevent it from going away. Should be used for
1512 *	all allocations for pages that will be mapped into
1513 * 	user space. Returns NULL when no page can be allocated.
1514 *
1515 *	Should be called with the mmap_sem of the vma's mm held.
1516 */
1517struct page *
1518alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1519{
1520	struct mempolicy *pol = get_vma_policy(current, vma, addr);
1521	struct zonelist *zl;
1522
1523	cpuset_update_task_memory_state();
1524
1525	if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
1526		unsigned nid;
1527
1528		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1529		mpol_cond_put(pol);
1530		return alloc_page_interleave(gfp, 0, nid);
1531	}
1532	zl = policy_zonelist(gfp, pol);
1533	if (unlikely(mpol_needs_cond_ref(pol))) {
1534		/*
1535		 * slow path: ref counted shared policy
1536		 */
1537		struct page *page =  __alloc_pages_nodemask(gfp, 0,
1538						zl, policy_nodemask(gfp, pol));
1539		__mpol_put(pol);
1540		return page;
1541	}
1542	/*
1543	 * fast path:  default or task policy
1544	 */
1545	return __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol));
1546}
1547
1548/**
1549 * 	alloc_pages_current - Allocate pages.
1550 *
1551 *	@gfp:
1552 *		%GFP_USER   user allocation,
1553 *      	%GFP_KERNEL kernel allocation,
1554 *      	%GFP_HIGHMEM highmem allocation,
1555 *      	%GFP_FS     don't call back into a file system.
1556 *      	%GFP_ATOMIC don't sleep.
1557 *	@order: Power of two of allocation size in pages. 0 is a single page.
1558 *
1559 *	Allocate a page from the kernel page pool.  When not in
1560 *	interrupt context, applies the current process' NUMA policy.
1561 *	Returns NULL when no page can be allocated.
1562 *
1563 *	Don't call cpuset_update_task_memory_state() unless
1564 *	1) it's ok to take cpuset_sem (can WAIT), and
1565 *	2) allocating for current task (not interrupt).
1566 */
1567struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1568{
1569	struct mempolicy *pol = current->mempolicy;
1570
1571	if ((gfp & __GFP_WAIT) && !in_interrupt())
1572		cpuset_update_task_memory_state();
1573	if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1574		pol = &default_policy;
1575
1576	/*
1577	 * No reference counting needed for current->mempolicy
1578	 * nor system default_policy
1579	 */
1580	if (pol->mode == MPOL_INTERLEAVE)
1581		return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1582	return __alloc_pages_nodemask(gfp, order,
1583			policy_zonelist(gfp, pol), policy_nodemask(gfp, pol));
1584}
1585EXPORT_SYMBOL(alloc_pages_current);
1586
1587/*
1588 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
1589 * rebinds the mempolicy it's copying by calling mpol_rebind_policy()
1590 * with the mems_allowed returned by cpuset_mems_allowed().  This
1591 * keeps mempolicies cpuset relative after its cpuset moves.  See
1592 * further kernel/cpuset.c update_nodemask().
1593 */
1594
1595/* Slow path of a mempolicy duplicate */
1596struct mempolicy *__mpol_dup(struct mempolicy *old)
1597{
1598	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1599
1600	if (!new)
1601		return ERR_PTR(-ENOMEM);
1602	if (current_cpuset_is_being_rebound()) {
1603		nodemask_t mems = cpuset_mems_allowed(current);
1604		mpol_rebind_policy(old, &mems);
1605	}
1606	*new = *old;
1607	atomic_set(&new->refcnt, 1);
1608	return new;
1609}
1610
1611/*
1612 * If *frompol needs [has] an extra ref, copy *frompol to *tompol,
1613 * eliminate the MPOL_F_* flags that require conditional ref and
1614 * [NOTE!!!] drop the extra ref.  Not safe to reference *frompol directly
1615 * after return.  Use the returned value.
1616 *
1617 * Allows use of a mempolicy for, e.g., multiple allocations with a single
1618 * policy lookup, even if the policy needs/has extra ref on lookup.
1619 * shmem_readahead needs this.
1620 */
1621struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol,
1622						struct mempolicy *frompol)
1623{
1624	if (!mpol_needs_cond_ref(frompol))
1625		return frompol;
1626
1627	*tompol = *frompol;
1628	tompol->flags &= ~MPOL_F_SHARED;	/* copy doesn't need unref */
1629	__mpol_put(frompol);
1630	return tompol;
1631}
1632
1633static int mpol_match_intent(const struct mempolicy *a,
1634			     const struct mempolicy *b)
1635{
1636	if (a->flags != b->flags)
1637		return 0;
1638	if (!mpol_store_user_nodemask(a))
1639		return 1;
1640	return nodes_equal(a->w.user_nodemask, b->w.user_nodemask);
1641}
1642
1643/* Slow path of a mempolicy comparison */
1644int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1645{
1646	if (!a || !b)
1647		return 0;
1648	if (a->mode != b->mode)
1649		return 0;
1650	if (a->mode != MPOL_DEFAULT && !mpol_match_intent(a, b))
1651		return 0;
1652	switch (a->mode) {
1653	case MPOL_DEFAULT:
1654		return 1;
1655	case MPOL_BIND:
1656		/* Fall through */
1657	case MPOL_INTERLEAVE:
1658		return nodes_equal(a->v.nodes, b->v.nodes);
1659	case MPOL_PREFERRED:
1660		return a->v.preferred_node == b->v.preferred_node;
1661	default:
1662		BUG();
1663		return 0;
1664	}
1665}
1666
1667/*
1668 * Shared memory backing store policy support.
1669 *
1670 * Remember policies even when nobody has shared memory mapped.
1671 * The policies are kept in a Red-Black tree linked from the inode.
1672 * They are protected by the sp->lock spinlock, which should be held
1673 * for any accesses to the tree.
1674 */
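
/*
 * Rough lifecycle (a sketch): the owning object (e.g. a shmem inode)
 * calls mpol_shared_policy_init() at creation time; mbind() on a
 * mapping of the object reaches mpol_set_shared_policy() via the vma's
 * set_policy op; page allocations consult mpol_shared_policy_lookup()
 * through get_policy; and mpol_free_shared_policy() tears the tree
 * down when the object is destroyed.
 */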
1675
1676/* lookup first element intersecting start-end */
1677/* Caller holds sp->lock */
1678static struct sp_node *
1679sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1680{
1681	struct rb_node *n = sp->root.rb_node;
1682
1683	while (n) {
1684		struct sp_node *p = rb_entry(n, struct sp_node, nd);
1685
1686		if (start >= p->end)
1687			n = n->rb_right;
1688		else if (end <= p->start)
1689			n = n->rb_left;
1690		else
1691			break;
1692	}
1693	if (!n)
1694		return NULL;
1695	for (;;) {
1696		struct sp_node *w = NULL;
1697		struct rb_node *prev = rb_prev(n);
1698		if (!prev)
1699			break;
1700		w = rb_entry(prev, struct sp_node, nd);
1701		if (w->end <= start)
1702			break;
1703		n = prev;
1704	}
1705	return rb_entry(n, struct sp_node, nd);
1706}
1707
1708/* Insert a new shared policy into the list. */
1709/* Caller holds sp->lock */
1710static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1711{
1712	struct rb_node **p = &sp->root.rb_node;
1713	struct rb_node *parent = NULL;
1714	struct sp_node *nd;
1715
1716	while (*p) {
1717		parent = *p;
1718		nd = rb_entry(parent, struct sp_node, nd);
1719		if (new->start < nd->start)
1720			p = &(*p)->rb_left;
1721		else if (new->end > nd->end)
1722			p = &(*p)->rb_right;
1723		else
1724			BUG();
1725	}
1726	rb_link_node(&new->nd, parent, p);
1727	rb_insert_color(&new->nd, &sp->root);
1728	pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
1729		 new->policy ? new->policy->mode : 0);
1730}
1731
1732/* Find shared policy intersecting idx */
1733struct mempolicy *
1734mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1735{
1736	struct mempolicy *pol = NULL;
1737	struct sp_node *sn;
1738
1739	if (!sp->root.rb_node)
1740		return NULL;
1741	spin_lock(&sp->lock);
1742	sn = sp_lookup(sp, idx, idx+1);
1743	if (sn) {
1744		mpol_get(sn->policy);
1745		pol = sn->policy;
1746	}
1747	spin_unlock(&sp->lock);
1748	return pol;
1749}
1750
1751static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1752{
1753	pr_debug("deleting %lx-%lx\n", n->start, n->end);
1754	rb_erase(&n->nd, &sp->root);
1755	mpol_put(n->policy);
1756	kmem_cache_free(sn_cache, n);
1757}
1758
1759static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
1760				struct mempolicy *pol)
1761{
1762	struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1763
1764	if (!n)
1765		return NULL;
1766	n->start = start;
1767	n->end = end;
1768	mpol_get(pol);
1769	pol->flags |= MPOL_F_SHARED;	/* for unref */
1770	n->policy = pol;
1771	return n;
1772}
1773
1774/* Replace a policy range. */
1775static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1776				 unsigned long end, struct sp_node *new)
1777{
1778	struct sp_node *n, *new2 = NULL;
1779
1780restart:
1781	spin_lock(&sp->lock);
1782	n = sp_lookup(sp, start, end);
1783	/* Take care of old policies in the same range. */
1784	while (n && n->start < end) {
1785		struct rb_node *next = rb_next(&n->nd);
1786		if (n->start >= start) {
1787			if (n->end <= end)
1788				sp_delete(sp, n);
1789			else
1790				n->start = end;
1791		} else {
1792			/* Old policy spanning whole new range. */
1793			if (n->end > end) {
1794				if (!new2) {
1795					spin_unlock(&sp->lock);
1796					new2 = sp_alloc(end, n->end, n->policy);
1797					if (!new2)
1798						return -ENOMEM;
1799					goto restart;
1800				}
1801				n->end = start;
1802				sp_insert(sp, new2);
1803				new2 = NULL;
1804				break;
1805			} else
1806				n->end = start;
1807		}
1808		if (!next)
1809			break;
1810		n = rb_entry(next, struct sp_node, nd);
1811	}
1812	if (new)
1813		sp_insert(sp, new);
1814	spin_unlock(&sp->lock);
1815	if (new2) {
1816		mpol_put(new2->policy);
1817		kmem_cache_free(sn_cache, new2);
1818	}
1819	return 0;
1820}
1821
1822void mpol_shared_policy_init(struct shared_policy *info, unsigned short policy,
1823			unsigned short flags, nodemask_t *policy_nodes)
1824{
1825	info->root = RB_ROOT;
1826	spin_lock_init(&info->lock);
1827
1828	if (policy != MPOL_DEFAULT) {
1829		struct mempolicy *newpol;
1830
1831		/* Falls back to MPOL_DEFAULT on any error */
1832		newpol = mpol_new(policy, flags, policy_nodes);
1833		if (!IS_ERR(newpol)) {
1834			/* Create pseudo-vma that contains just the policy */
1835			struct vm_area_struct pvma;
1836
1837			memset(&pvma, 0, sizeof(struct vm_area_struct));
1838			/* Policy covers entire file */
1839			pvma.vm_end = TASK_SIZE;
1840			mpol_set_shared_policy(info, &pvma, newpol);
1841			mpol_put(newpol);
1842		}
1843	}
1844}
1845
1846int mpol_set_shared_policy(struct shared_policy *info,
1847			struct vm_area_struct *vma, struct mempolicy *npol)
1848{
1849	int err;
1850	struct sp_node *new = NULL;
1851	unsigned long sz = vma_pages(vma);
1852
1853	pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
1854		 vma->vm_pgoff,
1855		 sz, npol ? npol->mode : -1,
1856		 npol ? npol->flags : -1,
1857		 npol ? nodes_addr(npol->v.nodes)[0] : -1);
1858
1859	if (npol) {
1860		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1861		if (!new)
1862			return -ENOMEM;
1863	}
1864	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1865	if (err && new)
1866		kmem_cache_free(sn_cache, new);
1867	return err;
1868}
1869
1870/* Free a backing policy store on inode delete. */
1871void mpol_free_shared_policy(struct shared_policy *p)
1872{
1873	struct sp_node *n;
1874	struct rb_node *next;
1875
1876	if (!p->root.rb_node)
1877		return;
1878	spin_lock(&p->lock);
1879	next = rb_first(&p->root);
1880	while (next) {
1881		n = rb_entry(next, struct sp_node, nd);
1882		next = rb_next(&n->nd);
1883		rb_erase(&n->nd, &p->root);
1884		mpol_put(n->policy);
1885		kmem_cache_free(sn_cache, n);
1886	}
1887	spin_unlock(&p->lock);
1888}
1889
1890/* assumes fs == KERNEL_DS */
1891void __init numa_policy_init(void)
1892{
1893	nodemask_t interleave_nodes;
1894	unsigned long largest = 0;
1895	int nid, prefer = 0;
1896
1897	policy_cache = kmem_cache_create("numa_policy",
1898					 sizeof(struct mempolicy),
1899					 0, SLAB_PANIC, NULL);
1900
1901	sn_cache = kmem_cache_create("shared_policy_node",
1902				     sizeof(struct sp_node),
1903				     0, SLAB_PANIC, NULL);
1904
1905	/*
1906	 * Set interleaving policy for system init. Interleaving is only
1907	 * enabled across suitably sized nodes (default is >= 16MB), or
1908	 * fall back to the largest node if they're all smaller.
1909	 */
1910	nodes_clear(interleave_nodes);
1911	for_each_node_state(nid, N_HIGH_MEMORY) {
1912		unsigned long total_pages = node_present_pages(nid);
1913
1914		/* Preserve the largest node */
1915		if (largest < total_pages) {
1916			largest = total_pages;
1917			prefer = nid;
1918		}
1919
1920		/* Interleave this node? */
1921		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
1922			node_set(nid, interleave_nodes);
1923	}
1924
1925	/* All too small, use the largest */
1926	if (unlikely(nodes_empty(interleave_nodes)))
1927		node_set(prefer, interleave_nodes);
1928
1929	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
1930		printk("numa_policy_init: interleaving failed\n");
1931}
1932
1933/* Reset policy of current process to default */
1934void numa_default_policy(void)
1935{
1936	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
1937}
1938
1939/*
1940 * Display pages allocated per node and memory policy via /proc.
1941 */
1942static const char * const policy_types[] =
1943	{ "default", "prefer", "bind", "interleave" };
1944
1945/*
1946 * Convert a mempolicy into a string.
1947 * Returns the number of characters in buffer (if positive)
1948 * or an error (negative)
1949 */
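/*
 * Example output (a sketch, following the encoding below):
 *	"default", "prefer=1", "bind=1,3", "interleave=static=0-3"
 */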
1950static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
1951{
1952	char *p = buffer;
1953	int l;
1954	nodemask_t nodes;
1955	unsigned short mode = pol ? pol->mode : MPOL_DEFAULT;
1956	unsigned short flags = pol ? pol->flags : 0;
1957
1958	switch (mode) {
1959	case MPOL_DEFAULT:
1960		nodes_clear(nodes);
1961		break;
1962
1963	case MPOL_PREFERRED:
1964		nodes_clear(nodes);
1965		node_set(pol->v.preferred_node, nodes);
1966		break;
1967
1968	case MPOL_BIND:
1969		/* Fall through */
1970	case MPOL_INTERLEAVE:
1971		nodes = pol->v.nodes;
1972		break;
1973
1974	default:
1975		BUG();
1976		return -EFAULT;
1977	}
1978
1979	l = strlen(policy_types[mode]);
1980	if (buffer + maxlen < p + l + 1)
1981		return -ENOSPC;
1982
1983	strcpy(p, policy_types[mode]);
1984	p += l;
1985
1986	if (flags) {
1987		int need_bar = 0;
1988
1989		if (buffer + maxlen < p + 2)
1990			return -ENOSPC;
1991		*p++ = '=';
1992
1993		if (flags & MPOL_F_STATIC_NODES)
1994			p += sprintf(p, "%sstatic", need_bar++ ? "|" : "");
1995		if (flags & MPOL_F_RELATIVE_NODES)
1996			p += sprintf(p, "%srelative", need_bar++ ? "|" : "");
1997	}
1998
1999	if (!nodes_empty(nodes)) {
2000		if (buffer + maxlen < p + 2)
2001			return -ENOSPC;
2002		*p++ = '=';
2003		p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
2004	}
2005	return p - buffer;
2006}
2007
2008struct numa_maps {
2009	unsigned long pages;
2010	unsigned long anon;
2011	unsigned long active;
2012	unsigned long writeback;
2013	unsigned long mapcount_max;
2014	unsigned long dirty;
2015	unsigned long swapcache;
2016	unsigned long node[MAX_NUMNODES];
2017};
2018
2019static void gather_stats(struct page *page, void *private, int pte_dirty)
2020{
2021	struct numa_maps *md = private;
2022	int count = page_mapcount(page);
2023
2024	md->pages++;
2025	if (pte_dirty || PageDirty(page))
2026		md->dirty++;
2027
2028	if (PageSwapCache(page))
2029		md->swapcache++;
2030
2031	if (PageActive(page))
2032		md->active++;
2033
2034	if (PageWriteback(page))
2035		md->writeback++;
2036
2037	if (PageAnon(page))
2038		md->anon++;
2039
2040	if (count > md->mapcount_max)
2041		md->mapcount_max = count;
2042
2043	md->node[page_to_nid(page)]++;
2044}
2045
2046#ifdef CONFIG_HUGETLB_PAGE
2047static void check_huge_range(struct vm_area_struct *vma,
2048		unsigned long start, unsigned long end,
2049		struct numa_maps *md)
2050{
2051	unsigned long addr;
2052	struct page *page;
2053
2054	for (addr = start; addr < end; addr += HPAGE_SIZE) {
2055		pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK);
2056		pte_t pte;
2057
2058		if (!ptep)
2059			continue;
2060
2061		pte = *ptep;
2062		if (pte_none(pte))
2063			continue;
2064
2065		page = pte_page(pte);
2066		if (!page)
2067			continue;
2068
2069		gather_stats(page, md, pte_dirty(*ptep));
2070	}
2071}
2072#else
2073static inline void check_huge_range(struct vm_area_struct *vma,
2074		unsigned long start, unsigned long end,
2075		struct numa_maps *md)
2076{
2077}
2078#endif
2079
2080int show_numa_map(struct seq_file *m, void *v)
2081{
2082	struct proc_maps_private *priv = m->private;
2083	struct vm_area_struct *vma = v;
2084	struct numa_maps *md;
2085	struct file *file = vma->vm_file;
2086	struct mm_struct *mm = vma->vm_mm;
2087	struct mempolicy *pol;
2088	int n;
2089	char buffer[50];
2090
2091	if (!mm)
2092		return 0;
2093
2094	md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
2095	if (!md)
2096		return 0;
2097
2098	pol = get_vma_policy(priv->task, vma, vma->vm_start);
2099	mpol_to_str(buffer, sizeof(buffer), pol);
2100	mpol_cond_put(pol);
2101
2102	seq_printf(m, "%08lx %s", vma->vm_start, buffer);
2103
2104	if (file) {
2105		seq_printf(m, " file=");
2106		seq_path(m, &file->f_path, "\n\t= ");
2107	} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
2108		seq_printf(m, " heap");
2109	} else if (vma->vm_start <= mm->start_stack &&
2110			vma->vm_end >= mm->start_stack) {
2111		seq_printf(m, " stack");
2112	}
2113
2114	if (is_vm_hugetlb_page(vma)) {
2115		check_huge_range(vma, vma->vm_start, vma->vm_end, md);
2116		seq_printf(m, " huge");
2117	} else {
2118		check_pgd_range(vma, vma->vm_start, vma->vm_end,
2119			&node_states[N_HIGH_MEMORY], MPOL_MF_STATS, md);
2120	}
2121
2122	if (!md->pages)
2123		goto out;
2124
2125	if (md->anon)
2126		seq_printf(m," anon=%lu",md->anon);
2127
2128	if (md->dirty)
2129		seq_printf(m," dirty=%lu",md->dirty);
2130
2131	if (md->pages != md->anon && md->pages != md->dirty)
2132		seq_printf(m, " mapped=%lu", md->pages);
2133
2134	if (md->mapcount_max > 1)
2135		seq_printf(m, " mapmax=%lu", md->mapcount_max);
2136
2137	if (md->swapcache)
2138		seq_printf(m," swapcache=%lu", md->swapcache);
2139
2140	if (md->active < md->pages && !is_vm_hugetlb_page(vma))
2141		seq_printf(m," active=%lu", md->active);
2142
2143	if (md->writeback)
2144		seq_printf(m," writeback=%lu", md->writeback);
2145
2146	for_each_node_state(n, N_HIGH_MEMORY)
2147		if (md->node[n])
2148			seq_printf(m, " N%d=%lu", n, md->node[n]);
2149out:
2150	seq_putc(m, '\n');
2151	kfree(md);
2152
2153	if (m->count < m->size)
2154		m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
2155	return 0;
2156}
2157