mempolicy.c revision bea904d54d6faa92400f10c8ea3d3828b8e1eb93
1/*
2 * Simple NUMA memory policy for the Linux kernel.
3 *
4 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
6 * Subject to the GNU Public License, version 2.
7 *
8 * NUMA policy allows the user to give hints in which node(s) memory should
9 * be allocated.
10 *
11 * Support four policies per VMA and per process:
12 *
13 * The VMA policy has priority over the process policy for a page fault.
14 *
15 * interleave     Allocate memory interleaved over a set of nodes,
16 *                with normal fallback if it fails.
17 *                For VMA based allocations this interleaves based on the
18 *                offset into the backing object or offset into the mapping
19 *                for anonymous memory. For process policy a per-process
20 *                counter is used.
21 *
22 * bind           Only allocate memory on a specific set of nodes,
23 *                no fallback.
24 *                FIXME: memory is allocated starting with the first node
25 *                to the last. It would be better if bind would truly restrict
26 *                the allocation to memory nodes instead
27 *
28 * preferred       Try a specific node first before normal fallback.
29 *                As a special case, node -1 here means do the allocation
30 *                on the node of the local CPU. This is normally identical to default,
31 *                but useful to set in a VMA when you have a non default
32 *                process policy.
33 *
34 * default        Allocate on the local node first, or when on a VMA
35 *                use the process policy. This is what Linux always did
36 *		  in a NUMA aware kernel and still does by, ahem, default.
37 *
38 * The process policy is applied for most non-interrupt memory allocations
39 * in that process' context. Interrupts ignore the policies and always
40 * try to allocate on the local CPU. The VMA policy is only applied for memory
41 * allocations for a VMA in the VM.
42 *
43 * Currently there are a few corner cases in swapping where the policy
44 * is not applied, but the majority should be handled. When process policy
45 * is used it is not remembered over swap outs/swap ins.
46 *
47 * Only the highest zone in the zone hierarchy gets policied. Allocations
48 * requesting a lower zone just use default policy. This implies that
49 * on systems with highmem, kernel lowmem allocations don't get policied.
50 * Same with GFP_DMA allocations.
51 *
52 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
53 * all users and remembered even when nobody has memory mapped.
54 */
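
/*
 * Illustrative userspace usage (a sketch, not part of this file): the
 * policies above are set through the mbind(2) and set_mempolicy(2)
 * system calls, e.g. via the libnuma <numaif.h> wrappers.  Assuming a
 * machine with nodes 0 and 1, and ignoring error handling:
 *
 *	unsigned long mask = (1UL << 0) | (1UL << 1);
 *	unsigned long node0 = 1UL << 0;
 *
 *	// interleave this process' future allocations over nodes 0 and 1
 *	set_mempolicy(MPOL_INTERLEAVE, &mask, sizeof(mask) * 8);
 *	// restrict an existing mapping p of len bytes to node 0 only
 *	mbind(p, len, MPOL_BIND, &node0, sizeof(node0) * 8, MPOL_MF_MOVE);
 */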
55
56/* Notebook:
57   fix mmap readahead to honour policy and enable policy for any page cache
58   object
59   statistics for bigpages
60   global policy for page cache? currently it uses process policy. Requires
61   first item above.
62   handle mremap for shared memory (currently ignored for the policy)
63   grows down?
64   make bind policy root only? It can trigger oom much faster and the
65   kernel is not always graceful about that.
66*/
67
68#include <linux/mempolicy.h>
69#include <linux/mm.h>
70#include <linux/highmem.h>
71#include <linux/hugetlb.h>
72#include <linux/kernel.h>
73#include <linux/sched.h>
74#include <linux/nodemask.h>
75#include <linux/cpuset.h>
76#include <linux/gfp.h>
77#include <linux/slab.h>
78#include <linux/string.h>
79#include <linux/module.h>
80#include <linux/nsproxy.h>
81#include <linux/interrupt.h>
82#include <linux/init.h>
83#include <linux/compat.h>
84#include <linux/swap.h>
85#include <linux/seq_file.h>
86#include <linux/proc_fs.h>
87#include <linux/migrate.h>
88#include <linux/rmap.h>
89#include <linux/security.h>
90#include <linux/syscalls.h>
91
92#include <asm/tlbflush.h>
93#include <asm/uaccess.h>
94
95/* Internal flags */
96#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for contiguous vmas */
97#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
98#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)		/* Gather statistics */
99
100static struct kmem_cache *policy_cache;
101static struct kmem_cache *sn_cache;
102
103/* Highest zone. A specific allocation for a zone below that is not
104   policied. */
105enum zone_type policy_zone = 0;
106
107/*
108 * run-time system-wide default policy => local allocation
109 */
110struct mempolicy default_policy = {
111	.refcnt = ATOMIC_INIT(1), /* never free it */
112	.mode = MPOL_PREFERRED,
113	.v =  { .preferred_node =  -1 },
114};
115
116static const struct mempolicy_operations {
117	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
118	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
119} mpol_ops[MPOL_MAX];
120
121/* Check that the nodemask contains at least one populated zone */
122static int is_valid_nodemask(const nodemask_t *nodemask)
123{
124	int nd, k;
125
126	/* Check that there is something useful in this mask */
127	k = policy_zone;
128
129	for_each_node_mask(nd, *nodemask) {
130		struct zone *z;
131
132		for (k = 0; k <= policy_zone; k++) {
133			z = &NODE_DATA(nd)->node_zones[k];
134			if (z->present_pages > 0)
135				return 1;
136		}
137	}
138
139	return 0;
140}
141
142static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
143{
144	return pol->flags & (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES);
145}
146
147static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
148				   const nodemask_t *rel)
149{
150	nodemask_t tmp;
151	nodes_fold(tmp, *orig, nodes_weight(*rel));
152	nodes_onto(*ret, tmp, *rel);
153}
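
/*
 * Worked example (illustrative): a user-supplied relative nodemask of
 * {0,2} evaluated while the cpuset allows nodes {4,5,6} is first folded
 * onto the 3 allowed bits, giving {0,2}, and then mapped onto the
 * allowed set, giving {4,6}.
 */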
154
155static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
156{
157	if (nodes_empty(*nodes))
158		return -EINVAL;
159	pol->v.nodes = *nodes;
160	return 0;
161}
162
163static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
164{
165	if (!nodes)
166		pol->v.preferred_node = -1;	/* local allocation */
167	else if (nodes_empty(*nodes))
168		return -EINVAL;			/*  no allowed nodes */
169	else
170		pol->v.preferred_node = first_node(*nodes);
171	return 0;
172}
173
174static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
175{
176	if (!is_valid_nodemask(nodes))
177		return -EINVAL;
178	pol->v.nodes = *nodes;
179	return 0;
180}
181
182/* Create a new policy */
183static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
184				  nodemask_t *nodes)
185{
186	struct mempolicy *policy;
187	nodemask_t cpuset_context_nmask;
188	int ret;
189
190	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
191		 mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
192
193	if (mode == MPOL_DEFAULT) {
194		if (nodes && !nodes_empty(*nodes))
195			return ERR_PTR(-EINVAL);
196		return NULL;	/* simply delete any existing policy */
197	}
198	VM_BUG_ON(!nodes);
199
200	/*
201	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
202	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
203	 * All other modes require a valid pointer to a non-empty nodemask.
204	 */
205	if (mode == MPOL_PREFERRED) {
206		if (nodes_empty(*nodes)) {
207			if (((flags & MPOL_F_STATIC_NODES) ||
208			     (flags & MPOL_F_RELATIVE_NODES)))
209				return ERR_PTR(-EINVAL);
210			nodes = NULL;	/* flag local alloc */
211		}
212	} else if (nodes_empty(*nodes))
213		return ERR_PTR(-EINVAL);
214	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
215	if (!policy)
216		return ERR_PTR(-ENOMEM);
217	atomic_set(&policy->refcnt, 1);
218	policy->mode = mode;
219	policy->flags = flags;
220
221	if (nodes) {
222		/*
223		 * cpuset related setup doesn't apply to local allocation
224		 */
225		cpuset_update_task_memory_state();
226		if (flags & MPOL_F_RELATIVE_NODES)
227			mpol_relative_nodemask(&cpuset_context_nmask, nodes,
228					       &cpuset_current_mems_allowed);
229		else
230			nodes_and(cpuset_context_nmask, *nodes,
231				  cpuset_current_mems_allowed);
232		if (mpol_store_user_nodemask(policy))
233			policy->w.user_nodemask = *nodes;
234		else
235			policy->w.cpuset_mems_allowed =
236						cpuset_mems_allowed(current);
237	}
238
239	ret = mpol_ops[mode].create(policy,
240				nodes ? &cpuset_context_nmask : NULL);
241	if (ret < 0) {
242		kmem_cache_free(policy_cache, policy);
243		return ERR_PTR(ret);
244	}
245	return policy;
246}
247
248/* Slow path of a mpol destructor. */
249void __mpol_put(struct mempolicy *p)
250{
251	if (!atomic_dec_and_test(&p->refcnt))
252		return;
253	kmem_cache_free(policy_cache, p);
254}
255
256static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
257{
258}
259
260static void mpol_rebind_nodemask(struct mempolicy *pol,
261				 const nodemask_t *nodes)
262{
263	nodemask_t tmp;
264
265	if (pol->flags & MPOL_F_STATIC_NODES)
266		nodes_and(tmp, pol->w.user_nodemask, *nodes);
267	else if (pol->flags & MPOL_F_RELATIVE_NODES)
268		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
269	else {
270		nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed,
271			    *nodes);
272		pol->w.cpuset_mems_allowed = *nodes;
273	}
274
275	pol->v.nodes = tmp;
276	if (!node_isset(current->il_next, tmp)) {
277		current->il_next = next_node(current->il_next, tmp);
278		if (current->il_next >= MAX_NUMNODES)
279			current->il_next = first_node(tmp);
280		if (current->il_next >= MAX_NUMNODES)
281			current->il_next = numa_node_id();
282	}
283}
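
/*
 * Rebind example (illustrative): an interleave policy created over
 * nodes {0,1} while the cpuset allowed {0,1}, after the cpuset moves to
 * nodes {2,3}, becomes
 *	- {2,3} by default: node_remap() sends the i-th old allowed node
 *	  to the i-th new allowed node;
 *	- the empty set with MPOL_F_STATIC_NODES: the user mask {0,1} is
 *	  simply ANDed with the new allowed set;
 *	- {2,3} with MPOL_F_RELATIVE_NODES: the user mask is re-evaluated
 *	  relative to whatever the cpuset currently allows.
 */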
284
285static void mpol_rebind_preferred(struct mempolicy *pol,
286				  const nodemask_t *nodes)
287{
288	nodemask_t tmp;
289
290	if (pol->flags & MPOL_F_STATIC_NODES) {
291		int node = first_node(pol->w.user_nodemask);
292
293		if (node_isset(node, *nodes))
294			pol->v.preferred_node = node;
295		else
296			pol->v.preferred_node = -1;
297	} else if (pol->flags & MPOL_F_RELATIVE_NODES) {
298		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
299		pol->v.preferred_node = first_node(tmp);
300	} else if (pol->v.preferred_node != -1) {
301		pol->v.preferred_node = node_remap(pol->v.preferred_node,
302						   pol->w.cpuset_mems_allowed,
303						   *nodes);
304		pol->w.cpuset_mems_allowed = *nodes;
305	}
306}
307
308/* Migrate a policy to a different set of nodes */
309static void mpol_rebind_policy(struct mempolicy *pol,
310			       const nodemask_t *newmask)
311{
312	if (!pol)
313		return;
314	if (!mpol_store_user_nodemask(pol) &&
315	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
316		return;
317	mpol_ops[pol->mode].rebind(pol, newmask);
318}
319
320/*
321 * Wrapper for mpol_rebind_policy() that just requires task
322 * pointer, and updates task mempolicy.
323 */
324
325void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
326{
327	mpol_rebind_policy(tsk->mempolicy, new);
328}
329
330/*
331 * Rebind each vma in mm to new nodemask.
332 *
333 * Call holding a reference to mm.  Takes mm->mmap_sem during call.
334 */
335
336void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
337{
338	struct vm_area_struct *vma;
339
340	down_write(&mm->mmap_sem);
341	for (vma = mm->mmap; vma; vma = vma->vm_next)
342		mpol_rebind_policy(vma->vm_policy, new);
343	up_write(&mm->mmap_sem);
344}
345
346static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
347	[MPOL_DEFAULT] = {
348		.rebind = mpol_rebind_default,
349	},
350	[MPOL_INTERLEAVE] = {
351		.create = mpol_new_interleave,
352		.rebind = mpol_rebind_nodemask,
353	},
354	[MPOL_PREFERRED] = {
355		.create = mpol_new_preferred,
356		.rebind = mpol_rebind_preferred,
357	},
358	[MPOL_BIND] = {
359		.create = mpol_new_bind,
360		.rebind = mpol_rebind_nodemask,
361	},
362};
363
364static void gather_stats(struct page *, void *, int pte_dirty);
365static void migrate_page_add(struct page *page, struct list_head *pagelist,
366				unsigned long flags);
367
368/* Scan through the pages, checking whether they satisfy the given conditions. */
369static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
370		unsigned long addr, unsigned long end,
371		const nodemask_t *nodes, unsigned long flags,
372		void *private)
373{
374	pte_t *orig_pte;
375	pte_t *pte;
376	spinlock_t *ptl;
377
378	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
379	do {
380		struct page *page;
381		int nid;
382
383		if (!pte_present(*pte))
384			continue;
385		page = vm_normal_page(vma, addr, *pte);
386		if (!page)
387			continue;
388		/*
389		 * The check for PageReserved here is important to avoid
390		 * handling zero pages and other pages that may have been
391		 * marked special by the system.
392		 *
393		 * If PageReserved were not checked here then, for example,
394		 * the location of the zero page could have an influence
395		 * on MPOL_MF_STRICT, zero pages would be counted for
396		 * the per node stats, and there would be useless attempts
397		 * to put zero pages on the migration list.
398		 */
399		if (PageReserved(page))
400			continue;
401		nid = page_to_nid(page);
402		if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
403			continue;
404
405		if (flags & MPOL_MF_STATS)
406			gather_stats(page, private, pte_dirty(*pte));
407		else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
408			migrate_page_add(page, private, flags);
409		else
410			break;
411	} while (pte++, addr += PAGE_SIZE, addr != end);
412	pte_unmap_unlock(orig_pte, ptl);
413	return addr != end;
414}
415
416static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
417		unsigned long addr, unsigned long end,
418		const nodemask_t *nodes, unsigned long flags,
419		void *private)
420{
421	pmd_t *pmd;
422	unsigned long next;
423
424	pmd = pmd_offset(pud, addr);
425	do {
426		next = pmd_addr_end(addr, end);
427		if (pmd_none_or_clear_bad(pmd))
428			continue;
429		if (check_pte_range(vma, pmd, addr, next, nodes,
430				    flags, private))
431			return -EIO;
432	} while (pmd++, addr = next, addr != end);
433	return 0;
434}
435
436static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
437		unsigned long addr, unsigned long end,
438		const nodemask_t *nodes, unsigned long flags,
439		void *private)
440{
441	pud_t *pud;
442	unsigned long next;
443
444	pud = pud_offset(pgd, addr);
445	do {
446		next = pud_addr_end(addr, end);
447		if (pud_none_or_clear_bad(pud))
448			continue;
449		if (check_pmd_range(vma, pud, addr, next, nodes,
450				    flags, private))
451			return -EIO;
452	} while (pud++, addr = next, addr != end);
453	return 0;
454}
455
456static inline int check_pgd_range(struct vm_area_struct *vma,
457		unsigned long addr, unsigned long end,
458		const nodemask_t *nodes, unsigned long flags,
459		void *private)
460{
461	pgd_t *pgd;
462	unsigned long next;
463
464	pgd = pgd_offset(vma->vm_mm, addr);
465	do {
466		next = pgd_addr_end(addr, end);
467		if (pgd_none_or_clear_bad(pgd))
468			continue;
469		if (check_pud_range(vma, pgd, addr, next, nodes,
470				    flags, private))
471			return -EIO;
472	} while (pgd++, addr = next, addr != end);
473	return 0;
474}
475
476/*
477 * Check if all pages in a range are on a set of nodes.
478 * If pagelist != NULL then isolate pages from the LRU and
479 * put them on the pagelist.
480 */
481static struct vm_area_struct *
482check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
483		const nodemask_t *nodes, unsigned long flags, void *private)
484{
485	int err;
486	struct vm_area_struct *first, *vma, *prev;
487
488	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
489
490		err = migrate_prep();
491		if (err)
492			return ERR_PTR(err);
493	}
494
495	first = find_vma(mm, start);
496	if (!first)
497		return ERR_PTR(-EFAULT);
498	prev = NULL;
499	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
500		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
501			if (!vma->vm_next && vma->vm_end < end)
502				return ERR_PTR(-EFAULT);
503			if (prev && prev->vm_end < vma->vm_start)
504				return ERR_PTR(-EFAULT);
505		}
506		if (!is_vm_hugetlb_page(vma) &&
507		    ((flags & MPOL_MF_STRICT) ||
508		     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
509				vma_migratable(vma)))) {
510			unsigned long endvma = vma->vm_end;
511
512			if (endvma > end)
513				endvma = end;
514			if (vma->vm_start > start)
515				start = vma->vm_start;
516			err = check_pgd_range(vma, start, endvma, nodes,
517						flags, private);
518			if (err) {
519				first = ERR_PTR(err);
520				break;
521			}
522		}
523		prev = vma;
524	}
525	return first;
526}
527
528/* Apply policy to a single VMA */
529static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
530{
531	int err = 0;
532	struct mempolicy *old = vma->vm_policy;
533
534	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
535		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
536		 vma->vm_ops, vma->vm_file,
537		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
538
539	if (vma->vm_ops && vma->vm_ops->set_policy)
540		err = vma->vm_ops->set_policy(vma, new);
541	if (!err) {
542		mpol_get(new);
543		vma->vm_policy = new;
544		mpol_put(old);
545	}
546	return err;
547}
548
549/* Step 2: apply policy to a range and do splits. */
550static int mbind_range(struct vm_area_struct *vma, unsigned long start,
551		       unsigned long end, struct mempolicy *new)
552{
553	struct vm_area_struct *next;
554	int err;
555
556	err = 0;
557	for (; vma && vma->vm_start < end; vma = next) {
558		next = vma->vm_next;
559		if (vma->vm_start < start)
560			err = split_vma(vma->vm_mm, vma, start, 1);
561		if (!err && vma->vm_end > end)
562			err = split_vma(vma->vm_mm, vma, end, 0);
563		if (!err)
564			err = policy_vma(vma, new);
565		if (err)
566			break;
567	}
568	return err;
569}
570
571/*
572 * Update task->flags PF_MEMPOLICY bit: set iff non-default
573 * mempolicy.  Allows more rapid checking of this (combined perhaps
574 * with other PF_* flag bits) on memory allocation hot code paths.
575 *
576 * If called from outside this file, the task 'p' should -only- be
577 * a newly forked child not yet visible on the task list, because
578 * manipulating the task flags of a visible task is not safe.
579 *
580 * The above limitation is why this routine has the funny name
581 * mpol_fix_fork_child_flag().
582 *
583 * It is also safe to call this with a task pointer of current,
584 * which the static wrapper mpol_set_task_struct_flag() does,
585 * for use within this file.
586 */
587
588void mpol_fix_fork_child_flag(struct task_struct *p)
589{
590	if (p->mempolicy)
591		p->flags |= PF_MEMPOLICY;
592	else
593		p->flags &= ~PF_MEMPOLICY;
594}
595
596static void mpol_set_task_struct_flag(void)
597{
598	mpol_fix_fork_child_flag(current);
599}
600
601/* Set the process memory policy */
602static long do_set_mempolicy(unsigned short mode, unsigned short flags,
603			     nodemask_t *nodes)
604{
605	struct mempolicy *new;
606	struct mm_struct *mm = current->mm;
607
608	new = mpol_new(mode, flags, nodes);
609	if (IS_ERR(new))
610		return PTR_ERR(new);
611
612	/*
613	 * prevent changing our mempolicy while show_numa_maps()
614	 * is using it.
615	 * Note:  do_set_mempolicy() can be called at init time
616	 * with no 'mm'.
617	 */
618	if (mm)
619		down_write(&mm->mmap_sem);
620	mpol_put(current->mempolicy);
621	current->mempolicy = new;
622	mpol_set_task_struct_flag();
623	if (new && new->mode == MPOL_INTERLEAVE &&
624	    nodes_weight(new->v.nodes))
625		current->il_next = first_node(new->v.nodes);
626	if (mm)
627		up_write(&mm->mmap_sem);
628
629	return 0;
630}
631
632/*
633 * Return nodemask for policy for get_mempolicy() query
634 */
635static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
636{
637	nodes_clear(*nodes);
638	if (p == &default_policy)
639		return;
640
641	switch (p->mode) {
642	case MPOL_BIND:
643		/* Fall through */
644	case MPOL_INTERLEAVE:
645		*nodes = p->v.nodes;
646		break;
647	case MPOL_PREFERRED:
648		/* or use current node instead of memory_map? */
649		if (p->v.preferred_node < 0)
650			*nodes = node_states[N_HIGH_MEMORY];
651		else
652			node_set(p->v.preferred_node, *nodes);
653		break;
654	default:
655		BUG();
656	}
657}
658
659static int lookup_node(struct mm_struct *mm, unsigned long addr)
660{
661	struct page *p;
662	int err;
663
664	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
665	if (err >= 0) {
666		err = page_to_nid(p);
667		put_page(p);
668	}
669	return err;
670}
671
672/* Retrieve NUMA policy */
673static long do_get_mempolicy(int *policy, nodemask_t *nmask,
674			     unsigned long addr, unsigned long flags)
675{
676	int err;
677	struct mm_struct *mm = current->mm;
678	struct vm_area_struct *vma = NULL;
679	struct mempolicy *pol = current->mempolicy;
680
681	cpuset_update_task_memory_state();
682	if (flags &
683		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
684		return -EINVAL;
685
686	if (flags & MPOL_F_MEMS_ALLOWED) {
687		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
688			return -EINVAL;
689		*policy = 0;	/* just so it's initialized */
690		*nmask  = cpuset_current_mems_allowed;
691		return 0;
692	}
693
694	if (flags & MPOL_F_ADDR) {
695		/*
696		 * Do NOT fall back to task policy if the
697		 * vma/shared policy at addr is NULL.  We
698		 * want to return MPOL_DEFAULT in this case.
699		 */
700		down_read(&mm->mmap_sem);
701		vma = find_vma_intersection(mm, addr, addr+1);
702		if (!vma) {
703			up_read(&mm->mmap_sem);
704			return -EFAULT;
705		}
706		if (vma->vm_ops && vma->vm_ops->get_policy)
707			pol = vma->vm_ops->get_policy(vma, addr);
708		else
709			pol = vma->vm_policy;
710	} else if (addr)
711		return -EINVAL;
712
713	if (!pol)
714		pol = &default_policy;	/* indicates default behavior */
715
716	if (flags & MPOL_F_NODE) {
717		if (flags & MPOL_F_ADDR) {
718			err = lookup_node(mm, addr);
719			if (err < 0)
720				goto out;
721			*policy = err;
722		} else if (pol == current->mempolicy &&
723				pol->mode == MPOL_INTERLEAVE) {
724			*policy = current->il_next;
725		} else {
726			err = -EINVAL;
727			goto out;
728		}
729	} else {
730		*policy = pol == &default_policy ? MPOL_DEFAULT :
731						pol->mode;
732		*policy |= pol->flags;
733	}
734
735	if (vma) {
736		up_read(&current->mm->mmap_sem);
737		vma = NULL;
738	}
739
740	err = 0;
741	if (nmask)
742		get_policy_nodemask(pol, nmask);
743
744 out:
745	mpol_cond_put(pol);
746	if (vma)
747		up_read(&current->mm->mmap_sem);
748	return err;
749}
750
751#ifdef CONFIG_MIGRATION
752/*
753 * page migration
754 */
755static void migrate_page_add(struct page *page, struct list_head *pagelist,
756				unsigned long flags)
757{
758	/*
759	 * Avoid migrating a page that is shared with others.
760	 */
761	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
762		isolate_lru_page(page, pagelist);
763}
764
765static struct page *new_node_page(struct page *page, unsigned long node, int **x)
766{
767	return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0);
768}
769
770/*
771 * Migrate pages from one node to a target node.
772 * Returns error or the number of pages not migrated.
773 */
774static int migrate_to_node(struct mm_struct *mm, int source, int dest,
775			   int flags)
776{
777	nodemask_t nmask;
778	LIST_HEAD(pagelist);
779	int err = 0;
780
781	nodes_clear(nmask);
782	node_set(source, nmask);
783
784	check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
785			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
786
787	if (!list_empty(&pagelist))
788		err = migrate_pages(&pagelist, new_node_page, dest);
789
790	return err;
791}
792
793/*
794 * Move pages between the two nodesets so as to preserve the physical
795 * layout as much as possible.
796 *
797 * Returns the number of pages that could not be moved.
798 */
799int do_migrate_pages(struct mm_struct *mm,
800	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
801{
802	LIST_HEAD(pagelist);
803	int busy = 0;
804	int err = 0;
805	nodemask_t tmp;
806
807	down_read(&mm->mmap_sem);
808
809	err = migrate_vmas(mm, from_nodes, to_nodes, flags);
810	if (err)
811		goto out;
812
813/*
814 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
815 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
816 * bit in 'tmp', and return that <source, dest> pair for migration.
817 * The pair of nodemasks 'to' and 'from' define the map.
818 *
819 * If no pair of bits is found that way, fall back to picking some
820 * pair of 'source' and 'dest' bits that are not the same.  If the
821 * 'source' and 'dest' bits are the same, this represents a node
822 * that will be migrating to itself, so no pages need move.
823 *
824 * If no bits are left in 'tmp', or if all remaining bits left
825 * in 'tmp' correspond to the same bit in 'to', return false
826 * (nothing left to migrate).
827 *
828 * This lets us pick a pair of nodes to migrate between, such that
829 * if possible the dest node is not already occupied by some other
830 * source node, minimizing the risk of overloading the memory on a
831 * node that would happen if we migrated incoming memory to a node
832 * before migrating outgoing memory sourced from that same node.
833 *
834 * A single scan of tmp is sufficient.  As we go, we remember the
835 * most recent <s, d> pair that moved (s != d).  If we find a pair
836 * that not only moved, but what's better, moved to an empty slot
837 * (d is not set in tmp), then we break out then, with that pair.
838 * Otherwise, when we finish scanning tmp, we at least have the
839 * most recent <s, d> pair that moved.  If we get all the way through
840 * the scan of tmp without finding any node that moved, much less
841 * moved to an empty node, then there is nothing left worth migrating.
842 */
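
/*
 * Worked example (illustrative): from = {0,1}, to = {1,2}, so the remap
 * sends 0 -> 1 and 1 -> 2.  The scan prefers the pair whose destination
 * is no longer a pending source, so node 1 is emptied into node 2
 * first, and only then is node 0 migrated onto node 1.
 */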
843
844	tmp = *from_nodes;
845	while (!nodes_empty(tmp)) {
846		int s, d;
847		int source = -1;
848		int dest = 0;
849
850		for_each_node_mask(s, tmp) {
851			d = node_remap(s, *from_nodes, *to_nodes);
852			if (s == d)
853				continue;
854
855			source = s;	/* Node moved. Memorize */
856			dest = d;
857
858			/* dest not in remaining from nodes? */
859			if (!node_isset(dest, tmp))
860				break;
861		}
862		if (source == -1)
863			break;
864
865		node_clear(source, tmp);
866		err = migrate_to_node(mm, source, dest, flags);
867		if (err > 0)
868			busy += err;
869		if (err < 0)
870			break;
871	}
872out:
873	up_read(&mm->mmap_sem);
874	if (err < 0)
875		return err;
876	return busy;
877
878}
879
880/*
881 * Allocate a new page for page migration based on vma policy.
882 * Start assuming that page is mapped by vma pointed to by @private.
883 * Search forward from there, if not.  N.B., this assumes that the
884 * list of pages handed to migrate_pages()--which is how we get here--
885 * is in virtual address order.
886 */
887static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
888{
889	struct vm_area_struct *vma = (struct vm_area_struct *)private;
890	unsigned long uninitialized_var(address);
891
892	while (vma) {
893		address = page_address_in_vma(page, vma);
894		if (address != -EFAULT)
895			break;
896		vma = vma->vm_next;
897	}
898
899	/*
900	 * if !vma, alloc_page_vma() will use task or system default policy
901	 */
902	return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
903}
904#else
905
906static void migrate_page_add(struct page *page, struct list_head *pagelist,
907				unsigned long flags)
908{
909}
910
911int do_migrate_pages(struct mm_struct *mm,
912	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
913{
914	return -ENOSYS;
915}
916
917static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
918{
919	return NULL;
920}
921#endif
922
923static long do_mbind(unsigned long start, unsigned long len,
924		     unsigned short mode, unsigned short mode_flags,
925		     nodemask_t *nmask, unsigned long flags)
926{
927	struct vm_area_struct *vma;
928	struct mm_struct *mm = current->mm;
929	struct mempolicy *new;
930	unsigned long end;
931	int err;
932	LIST_HEAD(pagelist);
933
934	if (flags & ~(unsigned long)(MPOL_MF_STRICT |
935				     MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
936		return -EINVAL;
937	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
938		return -EPERM;
939
940	if (start & ~PAGE_MASK)
941		return -EINVAL;
942
943	if (mode == MPOL_DEFAULT)
944		flags &= ~MPOL_MF_STRICT;
945
946	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
947	end = start + len;
948
949	if (end < start)
950		return -EINVAL;
951	if (end == start)
952		return 0;
953
954	new = mpol_new(mode, mode_flags, nmask);
955	if (IS_ERR(new))
956		return PTR_ERR(new);
957
958	/*
959	 * If we are using the default policy then operation
960	 * on discontinuous address spaces is okay after all
961	 */
962	if (!new)
963		flags |= MPOL_MF_DISCONTIG_OK;
964
965	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
966		 start, start + len, mode, mode_flags,
967		 nmask ? nodes_addr(*nmask)[0] : -1);
968
969	down_write(&mm->mmap_sem);
970	vma = check_range(mm, start, end, nmask,
971			  flags | MPOL_MF_INVERT, &pagelist);
972
973	err = PTR_ERR(vma);
974	if (!IS_ERR(vma)) {
975		int nr_failed = 0;
976
977		err = mbind_range(vma, start, end, new);
978
979		if (!list_empty(&pagelist))
980			nr_failed = migrate_pages(&pagelist, new_vma_page,
981						(unsigned long)vma);
982
983		if (!err && nr_failed && (flags & MPOL_MF_STRICT))
984			err = -EIO;
985	}
986
987	up_write(&mm->mmap_sem);
988	mpol_put(new);
989	return err;
990}
991
992/*
993 * User space interface with variable sized bitmaps for nodelists.
994 */
995
996/* Copy a node mask from user space. */
997static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
998		     unsigned long maxnode)
999{
1000	unsigned long k;
1001	unsigned long nlongs;
1002	unsigned long endmask;
1003
1004	--maxnode;
1005	nodes_clear(*nodes);
1006	if (maxnode == 0 || !nmask)
1007		return 0;
1008	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1009		return -EINVAL;
1010
1011	nlongs = BITS_TO_LONGS(maxnode);
1012	if ((maxnode % BITS_PER_LONG) == 0)
1013		endmask = ~0UL;
1014	else
1015		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1016
1017	/* When the user specified more nodes than supported, just check
1018	   that the unsupported part is all zero. */
1019	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1020		if (nlongs > PAGE_SIZE/sizeof(long))
1021			return -EINVAL;
1022		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1023			unsigned long t;
1024			if (get_user(t, nmask + k))
1025				return -EFAULT;
1026			if (k == nlongs - 1) {
1027				if (t & endmask)
1028					return -EINVAL;
1029			} else if (t)
1030				return -EINVAL;
1031		}
1032		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1033		endmask = ~0UL;
1034	}
1035
1036	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1037		return -EFAULT;
1038	nodes_addr(*nodes)[nlongs-1] &= endmask;
1039	return 0;
1040}
1041
1042/* Copy a kernel node mask to user space */
1043static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1044			      nodemask_t *nodes)
1045{
1046	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1047	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1048
1049	if (copy > nbytes) {
1050		if (copy > PAGE_SIZE)
1051			return -EINVAL;
1052		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1053			return -EFAULT;
1054		copy = nbytes;
1055	}
1056	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1057}
1058
1059asmlinkage long sys_mbind(unsigned long start, unsigned long len,
1060			unsigned long mode,
1061			unsigned long __user *nmask, unsigned long maxnode,
1062			unsigned flags)
1063{
1064	nodemask_t nodes;
1065	int err;
1066	unsigned short mode_flags;
1067
1068	mode_flags = mode & MPOL_MODE_FLAGS;
1069	mode &= ~MPOL_MODE_FLAGS;
1070	if (mode >= MPOL_MAX)
1071		return -EINVAL;
1072	if ((mode_flags & MPOL_F_STATIC_NODES) &&
1073	    (mode_flags & MPOL_F_RELATIVE_NODES))
1074		return -EINVAL;
1075	err = get_nodes(&nodes, nmask, maxnode);
1076	if (err)
1077		return err;
1078	return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1079}
1080
1081/* Set the process memory policy */
1082asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
1083		unsigned long maxnode)
1084{
1085	int err;
1086	nodemask_t nodes;
1087	unsigned short flags;
1088
1089	flags = mode & MPOL_MODE_FLAGS;
1090	mode &= ~MPOL_MODE_FLAGS;
1091	if ((unsigned int)mode >= MPOL_MAX)
1092		return -EINVAL;
1093	if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1094		return -EINVAL;
1095	err = get_nodes(&nodes, nmask, maxnode);
1096	if (err)
1097		return err;
1098	return do_set_mempolicy(mode, flags, &nodes);
1099}
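
/*
 * Illustrative call (userspace sketch, error handling omitted): the mode
 * argument packs the optional mode flags into its high bits, e.g.
 *
 *	unsigned long mask = (1UL << 0) | (1UL << 1);
 *	set_mempolicy(MPOL_INTERLEAVE | MPOL_F_STATIC_NODES, &mask,
 *		      sizeof(mask) * 8);
 */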
1100
1101asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
1102		const unsigned long __user *old_nodes,
1103		const unsigned long __user *new_nodes)
1104{
1105	struct mm_struct *mm;
1106	struct task_struct *task;
1107	nodemask_t old;
1108	nodemask_t new;
1109	nodemask_t task_nodes;
1110	int err;
1111
1112	err = get_nodes(&old, old_nodes, maxnode);
1113	if (err)
1114		return err;
1115
1116	err = get_nodes(&new, new_nodes, maxnode);
1117	if (err)
1118		return err;
1119
1120	/* Find the mm_struct */
1121	read_lock(&tasklist_lock);
1122	task = pid ? find_task_by_vpid(pid) : current;
1123	if (!task) {
1124		read_unlock(&tasklist_lock);
1125		return -ESRCH;
1126	}
1127	mm = get_task_mm(task);
1128	read_unlock(&tasklist_lock);
1129
1130	if (!mm)
1131		return -EINVAL;
1132
1133	/*
1134	 * Check if this process has the right to modify the specified
1135	 * process. The right exists if the process has administrative
1136	 * capabilities, superuser privileges or the same
1137	 * userid as the target process.
1138	 */
1139	if ((current->euid != task->suid) && (current->euid != task->uid) &&
1140	    (current->uid != task->suid) && (current->uid != task->uid) &&
1141	    !capable(CAP_SYS_NICE)) {
1142		err = -EPERM;
1143		goto out;
1144	}
1145
1146	task_nodes = cpuset_mems_allowed(task);
1147	/* Is the user allowed to access the target nodes? */
1148	if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
1149		err = -EPERM;
1150		goto out;
1151	}
1152
1153	if (!nodes_subset(new, node_states[N_HIGH_MEMORY])) {
1154		err = -EINVAL;
1155		goto out;
1156	}
1157
1158	err = security_task_movememory(task);
1159	if (err)
1160		goto out;
1161
1162	err = do_migrate_pages(mm, &old, &new,
1163		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1164out:
1165	mmput(mm);
1166	return err;
1167}
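
/*
 * Illustrative call (userspace sketch): move every page of process <pid>
 * currently resident on node 0 over to node 1, subject to the permission
 * and cpuset checks above:
 *
 *	unsigned long from = 1UL << 0, to = 1UL << 1;
 *	migrate_pages(pid, sizeof(from) * 8, &from, &to);
 */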
1168
1169
1170/* Retrieve NUMA policy */
1171asmlinkage long sys_get_mempolicy(int __user *policy,
1172				unsigned long __user *nmask,
1173				unsigned long maxnode,
1174				unsigned long addr, unsigned long flags)
1175{
1176	int err;
1177	int uninitialized_var(pval);
1178	nodemask_t nodes;
1179
1180	if (nmask != NULL && maxnode < MAX_NUMNODES)
1181		return -EINVAL;
1182
1183	err = do_get_mempolicy(&pval, &nodes, addr, flags);
1184
1185	if (err)
1186		return err;
1187
1188	if (policy && put_user(pval, policy))
1189		return -EFAULT;
1190
1191	if (nmask)
1192		err = copy_nodes_to_user(nmask, maxnode, &nodes);
1193
1194	return err;
1195}
1196
1197#ifdef CONFIG_COMPAT
1198
1199asmlinkage long compat_sys_get_mempolicy(int __user *policy,
1200				     compat_ulong_t __user *nmask,
1201				     compat_ulong_t maxnode,
1202				     compat_ulong_t addr, compat_ulong_t flags)
1203{
1204	long err;
1205	unsigned long __user *nm = NULL;
1206	unsigned long nr_bits, alloc_size;
1207	DECLARE_BITMAP(bm, MAX_NUMNODES);
1208
1209	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1210	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1211
1212	if (nmask)
1213		nm = compat_alloc_user_space(alloc_size);
1214
1215	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1216
1217	if (!err && nmask) {
1218		err = copy_from_user(bm, nm, alloc_size);
1219		/* ensure entire bitmap is zeroed */
1220		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1221		err |= compat_put_bitmap(nmask, bm, nr_bits);
1222	}
1223
1224	return err;
1225}
1226
1227asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1228				     compat_ulong_t maxnode)
1229{
1230	long err = 0;
1231	unsigned long __user *nm = NULL;
1232	unsigned long nr_bits, alloc_size;
1233	DECLARE_BITMAP(bm, MAX_NUMNODES);
1234
1235	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1236	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1237
1238	if (nmask) {
1239		err = compat_get_bitmap(bm, nmask, nr_bits);
1240		nm = compat_alloc_user_space(alloc_size);
1241		err |= copy_to_user(nm, bm, alloc_size);
1242	}
1243
1244	if (err)
1245		return -EFAULT;
1246
1247	return sys_set_mempolicy(mode, nm, nr_bits+1);
1248}
1249
1250asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1251			     compat_ulong_t mode, compat_ulong_t __user *nmask,
1252			     compat_ulong_t maxnode, compat_ulong_t flags)
1253{
1254	long err = 0;
1255	unsigned long __user *nm = NULL;
1256	unsigned long nr_bits, alloc_size;
1257	nodemask_t bm;
1258
1259	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1260	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1261
1262	if (nmask) {
1263		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1264		nm = compat_alloc_user_space(alloc_size);
1265		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1266	}
1267
1268	if (err)
1269		return -EFAULT;
1270
1271	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1272}
1273
1274#endif
1275
1276/*
1277 * get_vma_policy(@task, @vma, @addr)
1278 * @task - task for fallback if vma policy == default
1279 * @vma   - virtual memory area whose policy is sought
1280 * @addr  - address in @vma for shared policy lookup
1281 *
1282 * Returns effective policy for a VMA at specified address.
1283 * Falls back to @task or system default policy, as necessary.
1284 * Current or other task's task mempolicy and non-shared vma policies
1285 * are protected by the task's mmap_sem, which must be held for read by
1286 * the caller.
1287 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1288 * count--added by the get_policy() vm_op, as appropriate--to protect against
1289 * freeing by another task.  It is the caller's responsibility to free the
1290 * extra reference for shared policies.
1291 */
1292static struct mempolicy *get_vma_policy(struct task_struct *task,
1293		struct vm_area_struct *vma, unsigned long addr)
1294{
1295	struct mempolicy *pol = task->mempolicy;
1296
1297	if (vma) {
1298		if (vma->vm_ops && vma->vm_ops->get_policy) {
1299			struct mempolicy *vpol = vma->vm_ops->get_policy(vma,
1300									addr);
1301			if (vpol)
1302				pol = vpol;
1303		} else if (vma->vm_policy)
1304			pol = vma->vm_policy;
1305	}
1306	if (!pol)
1307		pol = &default_policy;
1308	return pol;
1309}
1310
1311/*
1312 * Return a nodemask representing a mempolicy for filtering nodes for
1313 * page allocation
1314 */
1315static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1316{
1317	/* Lower zones don't get a nodemask applied for MPOL_BIND */
1318	if (unlikely(policy->mode == MPOL_BIND) &&
1319			gfp_zone(gfp) >= policy_zone &&
1320			cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1321		return &policy->v.nodes;
1322
1323	return NULL;
1324}
1325
1326/* Return a zonelist indicated by gfp for node representing a mempolicy */
1327static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy)
1328{
1329	int nd;
1330
1331	switch (policy->mode) {
1332	case MPOL_PREFERRED:
1333		nd = policy->v.preferred_node;
1334		if (nd < 0)
1335			nd = numa_node_id();
1336		break;
1337	case MPOL_BIND:
1338		/*
1339		 * Normally, MPOL_BIND allocations are node-local within the
1340		 * allowed nodemask.  However, if __GFP_THISNODE is set and the
1341		 * current node is not part of the mask, we use the zonelist for
1342		 * the first node in the mask instead.
1343		 */
1344		nd = numa_node_id();
1345		if (unlikely(gfp & __GFP_THISNODE) &&
1346				unlikely(!node_isset(nd, policy->v.nodes)))
1347			nd = first_node(policy->v.nodes);
1348		break;
1349	case MPOL_INTERLEAVE: /* should not happen */
1350		nd = numa_node_id();
1351		break;
1352	default:
1353		nd = 0;
1354		BUG();
1355	}
1356	return node_zonelist(nd, gfp);
1357}
1358
1359/* Do dynamic interleaving for a process */
1360static unsigned interleave_nodes(struct mempolicy *policy)
1361{
1362	unsigned nid, next;
1363	struct task_struct *me = current;
1364
1365	nid = me->il_next;
1366	next = next_node(nid, policy->v.nodes);
1367	if (next >= MAX_NUMNODES)
1368		next = first_node(policy->v.nodes);
1369	if (next < MAX_NUMNODES)
1370		me->il_next = next;
1371	return nid;
1372}
1373
1374/*
1375 * Depending on the memory policy provide a node from which to allocate the
1376 * next slab entry.
1377 * @policy must be protected from freeing by the caller.  If @policy is
1378 * the current task's mempolicy, this protection is implicit, as only the
1379 * task can change its policy.  The system default policy requires no
1380 * such protection.
1381 */
1382unsigned slab_node(struct mempolicy *policy)
1383{
1384	if (!policy)
1385		return numa_node_id();
1386
1387	switch (policy->mode) {
1388	case MPOL_PREFERRED:
1389		if (unlikely(policy->v.preferred_node >= 0))
1390			return policy->v.preferred_node;
1391		return numa_node_id();
1392
1393	case MPOL_INTERLEAVE:
1394		return interleave_nodes(policy);
1395
1396	case MPOL_BIND: {
1397		/*
1398		 * Follow bind policy behavior and start allocation at the
1399		 * first node.
1400		 */
1401		struct zonelist *zonelist;
1402		struct zone *zone;
1403		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1404		zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
1405		(void)first_zones_zonelist(zonelist, highest_zoneidx,
1406							&policy->v.nodes,
1407							&zone);
1408		return zone->node;
1409	}
1410
1411	default:
1412		BUG();
1413	}
1414}
1415
1416/* Do static interleaving for a VMA with known offset. */
1417static unsigned offset_il_node(struct mempolicy *pol,
1418		struct vm_area_struct *vma, unsigned long off)
1419{
1420	unsigned nnodes = nodes_weight(pol->v.nodes);
1421	unsigned target;
1422	int c;
1423	int nid = -1;
1424
1425	if (!nnodes)
1426		return numa_node_id();
1427	target = (unsigned int)off % nnodes;
1428	c = 0;
1429	do {
1430		nid = next_node(nid, pol->v.nodes);
1431		c++;
1432	} while (c <= target);
1433	return nid;
1434}
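
/*
 * Worked example (illustrative): with an interleave mask of {0,2,5} and
 * offset 7, 7 % 3 selects index 1, i.e. node 2.  Pages of a mapping are
 * thus spread deterministically across the mask by their offset alone,
 * independent of which task faults them in.
 */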
1435
1436/* Determine a node number for interleave */
1437static inline unsigned interleave_nid(struct mempolicy *pol,
1438		 struct vm_area_struct *vma, unsigned long addr, int shift)
1439{
1440	if (vma) {
1441		unsigned long off;
1442
1443		/*
1444		 * for small pages, there is no difference between
1445		 * shift and PAGE_SHIFT, so the bit-shift is safe.
1446		 * for huge pages, since vm_pgoff is in units of small
1447		 * pages, we need to shift off the always 0 bits to get
1448		 * a useful offset.
1449		 */
1450		BUG_ON(shift < PAGE_SHIFT);
1451		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1452		off += (addr - vma->vm_start) >> shift;
1453		return offset_il_node(pol, vma, off);
1454	} else
1455		return interleave_nodes(pol);
1456}
1457
1458#ifdef CONFIG_HUGETLBFS
1459/*
1460 * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1461 * @vma = virtual memory area whose policy is sought
1462 * @addr = address in @vma for shared policy lookup and interleave policy
1463 * @gfp_flags = for requested zone
1464 * @mpol = pointer to mempolicy pointer for reference counted mempolicy
1465 * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
1466 *
1467 * Returns a zonelist suitable for a huge page allocation and a pointer
1468 * to the struct mempolicy for conditional unref after allocation.
1469 * If the effective policy is 'BIND', returns a pointer to the mempolicy's
1470 * @nodemask for filtering the zonelist.
1471 */
1472struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1473				gfp_t gfp_flags, struct mempolicy **mpol,
1474				nodemask_t **nodemask)
1475{
1476	struct zonelist *zl;
1477
1478	*mpol = get_vma_policy(current, vma, addr);
1479	*nodemask = NULL;	/* assume !MPOL_BIND */
1480
1481	if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1482		zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1483						HPAGE_SHIFT), gfp_flags);
1484	} else {
1485		zl = policy_zonelist(gfp_flags, *mpol);
1486		if ((*mpol)->mode == MPOL_BIND)
1487			*nodemask = &(*mpol)->v.nodes;
1488	}
1489	return zl;
1490}
1491#endif
1492
1493/* Allocate a page under an interleave policy.
1494   Own path because it needs to do special accounting. */
1495static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1496					unsigned nid)
1497{
1498	struct zonelist *zl;
1499	struct page *page;
1500
1501	zl = node_zonelist(nid, gfp);
1502	page = __alloc_pages(gfp, order, zl);
1503	if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1504		inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1505	return page;
1506}
1507
1508/**
1509 * 	alloc_page_vma	- Allocate a page for a VMA.
1510 *
1511 * 	@gfp:
1512 *      %GFP_USER    user allocation.
1513 *      %GFP_KERNEL  kernel allocations,
1514 *      %GFP_HIGHMEM highmem/user allocations,
1515 *      %GFP_FS      allocation should not call back into a file system.
1516 *      %GFP_ATOMIC  don't sleep.
1517 *
1518 * 	@vma:  Pointer to VMA or NULL if not available.
1519 *	@addr: Virtual Address of the allocation. Must be inside the VMA.
1520 *
1521 * 	This function allocates a page from the kernel page pool and applies
1522 *	a NUMA policy associated with the VMA or the current process.
1523 *	When VMA is not NULL caller must hold down_read on the mmap_sem of the
1524 *	mm_struct of the VMA to prevent it from going away. Should be used for
1525 *	all allocations for pages that will be mapped into
1526 * 	user space. Returns NULL when no page can be allocated.
1527 *
1528 *	Should be called with the mmap_sem of the vma held.
1529 */
1530struct page *
1531alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1532{
1533	struct mempolicy *pol = get_vma_policy(current, vma, addr);
1534	struct zonelist *zl;
1535
1536	cpuset_update_task_memory_state();
1537
1538	if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
1539		unsigned nid;
1540
1541		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1542		mpol_cond_put(pol);
1543		return alloc_page_interleave(gfp, 0, nid);
1544	}
1545	zl = policy_zonelist(gfp, pol);
1546	if (unlikely(mpol_needs_cond_ref(pol))) {
1547		/*
1548		 * slow path: ref counted shared policy
1549		 */
1550		struct page *page =  __alloc_pages_nodemask(gfp, 0,
1551						zl, policy_nodemask(gfp, pol));
1552		__mpol_put(pol);
1553		return page;
1554	}
1555	/*
1556	 * fast path:  default or task policy
1557	 */
1558	return __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol));
1559}
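
/*
 * Typical usage (a sketch; the exact call sites vary): fault handlers
 * allocate user pages through this helper so VMA and task policies are
 * honoured, roughly
 *
 *	page = alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, vma, addr);
 *
 * with the mmap_sem held for read, as required above.
 */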
1560
1561/**
1562 * 	alloc_pages_current - Allocate pages.
1563 *
1564 *	@gfp:
1565 *		%GFP_USER   user allocation,
1566 *      	%GFP_KERNEL kernel allocation,
1567 *      	%GFP_HIGHMEM highmem allocation,
1568 *      	%GFP_FS     don't call back into a file system.
1569 *      	%GFP_ATOMIC don't sleep.
1570 *	@order: Power of two of allocation size in pages. 0 is a single page.
1571 *
1572 *	Allocate a page from the kernel page pool.  When not in
1573 *	interrupt context, apply the current process' NUMA policy.
1574 *	Returns NULL when no page can be allocated.
1575 *
1576 *	Don't call cpuset_update_task_memory_state() unless
1577 *	1) it's ok to take cpuset_sem (can WAIT), and
1578 *	2) allocating for current task (not interrupt).
1579 */
1580struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1581{
1582	struct mempolicy *pol = current->mempolicy;
1583
1584	if ((gfp & __GFP_WAIT) && !in_interrupt())
1585		cpuset_update_task_memory_state();
1586	if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1587		pol = &default_policy;
1588
1589	/*
1590	 * No reference counting needed for current->mempolicy
1591	 * nor system default_policy
1592	 */
1593	if (pol->mode == MPOL_INTERLEAVE)
1594		return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1595	return __alloc_pages_nodemask(gfp, order,
1596			policy_zonelist(gfp, pol), policy_nodemask(gfp, pol));
1597}
1598EXPORT_SYMBOL(alloc_pages_current);
1599
1600/*
1601 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
1602 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
1603 * with the mems_allowed returned by cpuset_mems_allowed().  This
1604 * keeps mempolicies cpuset relative after its cpuset moves.  See
1605 * further kernel/cpuset.c update_nodemask().
1606 */
1607
1608/* Slow path of a mempolicy duplicate */
1609struct mempolicy *__mpol_dup(struct mempolicy *old)
1610{
1611	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1612
1613	if (!new)
1614		return ERR_PTR(-ENOMEM);
1615	if (current_cpuset_is_being_rebound()) {
1616		nodemask_t mems = cpuset_mems_allowed(current);
1617		mpol_rebind_policy(old, &mems);
1618	}
1619	*new = *old;
1620	atomic_set(&new->refcnt, 1);
1621	return new;
1622}
1623
1624/*
1625 * If *frompol needs [has] an extra ref, copy *frompol to *tompol,
1626 * eliminate the MPOL_F_* flags that require a conditional ref and
1627 * [NOTE!!!] drop the extra ref.  Not safe to reference *frompol directly
1628 * after return.  Use the returned value.
1629 *
1630 * Allows use of a mempolicy for, e.g., multiple allocations with a single
1631 * policy lookup, even if the policy needs/has extra ref on lookup.
1632 * shmem_readahead needs this.
1633 */
1634struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol,
1635						struct mempolicy *frompol)
1636{
1637	if (!mpol_needs_cond_ref(frompol))
1638		return frompol;
1639
1640	*tompol = *frompol;
1641	tompol->flags &= ~MPOL_F_SHARED;	/* copy doesn't need unref */
1642	__mpol_put(frompol);
1643	return tompol;
1644}
1645
1646static int mpol_match_intent(const struct mempolicy *a,
1647			     const struct mempolicy *b)
1648{
1649	if (a->flags != b->flags)
1650		return 0;
1651	if (!mpol_store_user_nodemask(a))
1652		return 1;
1653	return nodes_equal(a->w.user_nodemask, b->w.user_nodemask);
1654}
1655
1656/* Slow path of a mempolicy comparison */
1657int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1658{
1659	if (!a || !b)
1660		return 0;
1661	if (a->mode != b->mode)
1662		return 0;
1663	if (a->mode != MPOL_DEFAULT && !mpol_match_intent(a, b))
1664		return 0;
1665	switch (a->mode) {
1666	case MPOL_BIND:
1667		/* Fall through */
1668	case MPOL_INTERLEAVE:
1669		return nodes_equal(a->v.nodes, b->v.nodes);
1670	case MPOL_PREFERRED:
1671		return a->v.preferred_node == b->v.preferred_node;
1672	default:
1673		BUG();
1674		return 0;
1675	}
1676}
1677
1678/*
1679 * Shared memory backing store policy support.
1680 *
1681 * Remember policies even when nobody has shared memory mapped.
1682 * The policies are kept in a Red-Black tree linked from the inode.
1683 * They are protected by the sp->lock spinlock, which should be held
1684 * for any accesses to the tree.
1685 */
1686
1687/* lookup first element intersecting start-end */
1688/* Caller holds sp->lock */
1689static struct sp_node *
1690sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1691{
1692	struct rb_node *n = sp->root.rb_node;
1693
1694	while (n) {
1695		struct sp_node *p = rb_entry(n, struct sp_node, nd);
1696
1697		if (start >= p->end)
1698			n = n->rb_right;
1699		else if (end <= p->start)
1700			n = n->rb_left;
1701		else
1702			break;
1703	}
1704	if (!n)
1705		return NULL;
1706	for (;;) {
1707		struct sp_node *w = NULL;
1708		struct rb_node *prev = rb_prev(n);
1709		if (!prev)
1710			break;
1711		w = rb_entry(prev, struct sp_node, nd);
1712		if (w->end <= start)
1713			break;
1714		n = prev;
1715	}
1716	return rb_entry(n, struct sp_node, nd);
1717}
1718
1719/* Insert a new shared policy into the tree. */
1720/* Caller holds sp->lock */
1721static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1722{
1723	struct rb_node **p = &sp->root.rb_node;
1724	struct rb_node *parent = NULL;
1725	struct sp_node *nd;
1726
1727	while (*p) {
1728		parent = *p;
1729		nd = rb_entry(parent, struct sp_node, nd);
1730		if (new->start < nd->start)
1731			p = &(*p)->rb_left;
1732		else if (new->end > nd->end)
1733			p = &(*p)->rb_right;
1734		else
1735			BUG();
1736	}
1737	rb_link_node(&new->nd, parent, p);
1738	rb_insert_color(&new->nd, &sp->root);
1739	pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
1740		 new->policy ? new->policy->mode : 0);
1741}
1742
1743/* Find shared policy intersecting idx */
1744struct mempolicy *
1745mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1746{
1747	struct mempolicy *pol = NULL;
1748	struct sp_node *sn;
1749
1750	if (!sp->root.rb_node)
1751		return NULL;
1752	spin_lock(&sp->lock);
1753	sn = sp_lookup(sp, idx, idx+1);
1754	if (sn) {
1755		mpol_get(sn->policy);
1756		pol = sn->policy;
1757	}
1758	spin_unlock(&sp->lock);
1759	return pol;
1760}
1761
1762static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1763{
1764	pr_debug("deleting %lx-%lx\n", n->start, n->end);
1765	rb_erase(&n->nd, &sp->root);
1766	mpol_put(n->policy);
1767	kmem_cache_free(sn_cache, n);
1768}
1769
1770static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
1771				struct mempolicy *pol)
1772{
1773	struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1774
1775	if (!n)
1776		return NULL;
1777	n->start = start;
1778	n->end = end;
1779	mpol_get(pol);
1780	pol->flags |= MPOL_F_SHARED;	/* for unref */
1781	n->policy = pol;
1782	return n;
1783}
1784
1785/* Replace a policy range. */
1786static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1787				 unsigned long end, struct sp_node *new)
1788{
1789	struct sp_node *n, *new2 = NULL;
1790
1791restart:
1792	spin_lock(&sp->lock);
1793	n = sp_lookup(sp, start, end);
1794	/* Take care of old policies in the same range. */
1795	while (n && n->start < end) {
1796		struct rb_node *next = rb_next(&n->nd);
1797		if (n->start >= start) {
1798			if (n->end <= end)
1799				sp_delete(sp, n);
1800			else
1801				n->start = end;
1802		} else {
1803			/* Old policy spanning whole new range. */
1804			if (n->end > end) {
1805				if (!new2) {
1806					spin_unlock(&sp->lock);
1807					new2 = sp_alloc(end, n->end, n->policy);
1808					if (!new2)
1809						return -ENOMEM;
1810					goto restart;
1811				}
1812				n->end = start;
1813				sp_insert(sp, new2);
1814				new2 = NULL;
1815				break;
1816			} else
1817				n->end = start;
1818		}
1819		if (!next)
1820			break;
1821		n = rb_entry(next, struct sp_node, nd);
1822	}
1823	if (new)
1824		sp_insert(sp, new);
1825	spin_unlock(&sp->lock);
1826	if (new2) {
1827		mpol_put(new2->policy);
1828		kmem_cache_free(sn_cache, new2);
1829	}
1830	return 0;
1831}
1832
1833void mpol_shared_policy_init(struct shared_policy *info, unsigned short policy,
1834			unsigned short flags, nodemask_t *policy_nodes)
1835{
1836	info->root = RB_ROOT;
1837	spin_lock_init(&info->lock);
1838
1839	if (policy != MPOL_DEFAULT) {
1840		struct mempolicy *newpol;
1841
1842		/* Falls back to NULL policy [MPOL_DEFAULT] on any error */
1843		newpol = mpol_new(policy, flags, policy_nodes);
1844		if (!IS_ERR(newpol)) {
1845			/* Create pseudo-vma that contains just the policy */
1846			struct vm_area_struct pvma;
1847
1848			memset(&pvma, 0, sizeof(struct vm_area_struct));
1849			/* Policy covers entire file */
1850			pvma.vm_end = TASK_SIZE;
1851			mpol_set_shared_policy(info, &pvma, newpol);
1852			mpol_put(newpol);
1853		}
1854	}
1855}
1856
1857int mpol_set_shared_policy(struct shared_policy *info,
1858			struct vm_area_struct *vma, struct mempolicy *npol)
1859{
1860	int err;
1861	struct sp_node *new = NULL;
1862	unsigned long sz = vma_pages(vma);
1863
1864	pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
1865		 vma->vm_pgoff,
1866		 sz, npol ? npol->mode : -1,
1867		 npol ? npol->flags : -1,
1868		 npol ? nodes_addr(npol->v.nodes)[0] : -1);
1869
1870	if (npol) {
1871		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1872		if (!new)
1873			return -ENOMEM;
1874	}
1875	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1876	if (err && new)
1877		kmem_cache_free(sn_cache, new);
1878	return err;
1879}
1880
1881/* Free a backing policy store on inode delete. */
1882void mpol_free_shared_policy(struct shared_policy *p)
1883{
1884	struct sp_node *n;
1885	struct rb_node *next;
1886
1887	if (!p->root.rb_node)
1888		return;
1889	spin_lock(&p->lock);
1890	next = rb_first(&p->root);
1891	while (next) {
1892		n = rb_entry(next, struct sp_node, nd);
1893		next = rb_next(&n->nd);
1894		rb_erase(&n->nd, &p->root);
1895		mpol_put(n->policy);
1896		kmem_cache_free(sn_cache, n);
1897	}
1898	spin_unlock(&p->lock);
1899}
1900
1901/* assumes fs == KERNEL_DS */
1902void __init numa_policy_init(void)
1903{
1904	nodemask_t interleave_nodes;
1905	unsigned long largest = 0;
1906	int nid, prefer = 0;
1907
1908	policy_cache = kmem_cache_create("numa_policy",
1909					 sizeof(struct mempolicy),
1910					 0, SLAB_PANIC, NULL);
1911
1912	sn_cache = kmem_cache_create("shared_policy_node",
1913				     sizeof(struct sp_node),
1914				     0, SLAB_PANIC, NULL);
1915
1916	/*
1917	 * Set interleaving policy for system init. Interleaving is only
1918	 * enabled across suitably sized nodes (default is >= 16MB); if they
1919	 * are all smaller, fall back to the largest node.
1920	 */
1921	nodes_clear(interleave_nodes);
1922	for_each_node_state(nid, N_HIGH_MEMORY) {
1923		unsigned long total_pages = node_present_pages(nid);
1924
1925		/* Preserve the largest node */
1926		if (largest < total_pages) {
1927			largest = total_pages;
1928			prefer = nid;
1929		}
1930
1931		/* Interleave this node? */
1932		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
1933			node_set(nid, interleave_nodes);
1934	}
1935
1936	/* All too small, use the largest */
1937	if (unlikely(nodes_empty(interleave_nodes)))
1938		node_set(prefer, interleave_nodes);
1939
1940	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
1941		printk(KERN_WARNING "numa_policy_init: interleaving failed\n");
1942}
1943
1944/* Reset policy of current process to default */
1945void numa_default_policy(void)
1946{
1947	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
1948}
1949
1950/*
1951 * Display pages allocated per node and memory policy via /proc.
1952 */
1953static const char * const policy_types[] =
1954	{ "default", "prefer", "bind", "interleave" };
1955
1956/*
1957 * Convert a mempolicy into a string.
1958 * Returns the number of characters in buffer (if positive)
1959 * or an error (negative)
1960 */
1961static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
1962{
1963	char *p = buffer;
1964	int l;
1965	nodemask_t nodes;
1966	unsigned short mode;
1967	unsigned short flags = pol ? pol->flags : 0;
1968
1969	if (!pol || pol == &default_policy)
1970		mode = MPOL_DEFAULT;
1971	else
1972		mode = pol->mode;
1973
1974	switch (mode) {
1975	case MPOL_DEFAULT:
1976		nodes_clear(nodes);
1977		break;
1978
1979	case MPOL_PREFERRED:
1980		nodes_clear(nodes);
1981		node_set(pol->v.preferred_node, nodes);
1982		break;
1983
1984	case MPOL_BIND:
1985		/* Fall through */
1986	case MPOL_INTERLEAVE:
1987		nodes = pol->v.nodes;
1988		break;
1989
1990	default:
1991		BUG();
1992		return -EFAULT;
1993	}
1994
1995	l = strlen(policy_types[mode]);
1996	if (buffer + maxlen < p + l + 1)
1997		return -ENOSPC;
1998
1999	strcpy(p, policy_types[mode]);
2000	p += l;
2001
2002	if (flags) {
2003		int need_bar = 0;
2004
2005		if (buffer + maxlen < p + 2)
2006			return -ENOSPC;
2007		*p++ = '=';
2008
2009		if (flags & MPOL_F_STATIC_NODES)
2010			p += sprintf(p, "%sstatic", need_bar++ ? "|" : "");
2011		if (flags & MPOL_F_RELATIVE_NODES)
2012			p += sprintf(p, "%srelative", need_bar++ ? "|" : "");
2013	}
2014
2015	if (!nodes_empty(nodes)) {
2016		if (buffer + maxlen < p + 2)
2017			return -ENOSPC;
2018		*p++ = '=';
2019		p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
2020	}
2021	return p - buffer;
2022}
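
/*
 * Example output (illustrative): an interleave policy over nodes 0-3 is
 * rendered as "interleave=0-3"; with MPOL_F_STATIC_NODES set it becomes
 * "interleave=static=0-3".
 */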
2023
2024struct numa_maps {
2025	unsigned long pages;
2026	unsigned long anon;
2027	unsigned long active;
2028	unsigned long writeback;
2029	unsigned long mapcount_max;
2030	unsigned long dirty;
2031	unsigned long swapcache;
2032	unsigned long node[MAX_NUMNODES];
2033};
2034
2035static void gather_stats(struct page *page, void *private, int pte_dirty)
2036{
2037	struct numa_maps *md = private;
2038	int count = page_mapcount(page);
2039
2040	md->pages++;
2041	if (pte_dirty || PageDirty(page))
2042		md->dirty++;
2043
2044	if (PageSwapCache(page))
2045		md->swapcache++;
2046
2047	if (PageActive(page))
2048		md->active++;
2049
2050	if (PageWriteback(page))
2051		md->writeback++;
2052
2053	if (PageAnon(page))
2054		md->anon++;
2055
2056	if (count > md->mapcount_max)
2057		md->mapcount_max = count;
2058
2059	md->node[page_to_nid(page)]++;
2060}
2061
2062#ifdef CONFIG_HUGETLB_PAGE
2063static void check_huge_range(struct vm_area_struct *vma,
2064		unsigned long start, unsigned long end,
2065		struct numa_maps *md)
2066{
2067	unsigned long addr;
2068	struct page *page;
2069
2070	for (addr = start; addr < end; addr += HPAGE_SIZE) {
2071		pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK);
2072		pte_t pte;
2073
2074		if (!ptep)
2075			continue;
2076
2077		pte = *ptep;
2078		if (pte_none(pte))
2079			continue;
2080
2081		page = pte_page(pte);
2082		if (!page)
2083			continue;
2084
2085		gather_stats(page, md, pte_dirty(*ptep));
2086	}
2087}
2088#else
2089static inline void check_huge_range(struct vm_area_struct *vma,
2090		unsigned long start, unsigned long end,
2091		struct numa_maps *md)
2092{
2093}
2094#endif
2095
2096int show_numa_map(struct seq_file *m, void *v)
2097{
2098	struct proc_maps_private *priv = m->private;
2099	struct vm_area_struct *vma = v;
2100	struct numa_maps *md;
2101	struct file *file = vma->vm_file;
2102	struct mm_struct *mm = vma->vm_mm;
2103	struct mempolicy *pol;
2104	int n;
2105	char buffer[50];
2106
2107	if (!mm)
2108		return 0;
2109
2110	md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
2111	if (!md)
2112		return 0;
2113
2114	pol = get_vma_policy(priv->task, vma, vma->vm_start);
2115	mpol_to_str(buffer, sizeof(buffer), pol);
2116	mpol_cond_put(pol);
2117
2118	seq_printf(m, "%08lx %s", vma->vm_start, buffer);
2119
2120	if (file) {
2121		seq_printf(m, " file=");
2122		seq_path(m, &file->f_path, "\n\t= ");
2123	} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
2124		seq_printf(m, " heap");
2125	} else if (vma->vm_start <= mm->start_stack &&
2126			vma->vm_end >= mm->start_stack) {
2127		seq_printf(m, " stack");
2128	}
2129
2130	if (is_vm_hugetlb_page(vma)) {
2131		check_huge_range(vma, vma->vm_start, vma->vm_end, md);
2132		seq_printf(m, " huge");
2133	} else {
2134		check_pgd_range(vma, vma->vm_start, vma->vm_end,
2135			&node_states[N_HIGH_MEMORY], MPOL_MF_STATS, md);
2136	}
2137
2138	if (!md->pages)
2139		goto out;
2140
2141	if (md->anon)
2142		seq_printf(m," anon=%lu",md->anon);
2143
2144	if (md->dirty)
2145		seq_printf(m," dirty=%lu",md->dirty);
2146
2147	if (md->pages != md->anon && md->pages != md->dirty)
2148		seq_printf(m, " mapped=%lu", md->pages);
2149
2150	if (md->mapcount_max > 1)
2151		seq_printf(m, " mapmax=%lu", md->mapcount_max);
2152
2153	if (md->swapcache)
2154		seq_printf(m," swapcache=%lu", md->swapcache);
2155
2156	if (md->active < md->pages && !is_vm_hugetlb_page(vma))
2157		seq_printf(m," active=%lu", md->active);
2158
2159	if (md->writeback)
2160		seq_printf(m," writeback=%lu", md->writeback);
2161
2162	for_each_node_state(n, N_HIGH_MEMORY)
2163		if (md->node[n])
2164			seq_printf(m, " N%d=%lu", n, md->node[n]);
2165out:
2166	seq_putc(m, '\n');
2167	kfree(md);
2168
2169	if (m->count < m->size)
2170		m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
2171	return 0;
2172}
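
/*
 * Example line (illustrative) as it might appear in /proc/<pid>/numa_maps
 * for an anonymous interleaved mapping:
 *
 *	7f2a00000000 interleave=0-1 anon=512 dirty=512 N0=256 N1=256
 */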
2173