migrate.c revision bda8550deed96687f29992d711a88ea21cff4d26
/*
 * Memory Migration functionality - linux/mm/migration.c
 *
 * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
 *
 * Page migration was first developed in the context of the memory hotplug
 * project. The main authors of the migration code are:
 *
 * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
 * Hirokazu Takahashi <taka@valinux.co.jp>
 * Dave Hansen <haveblue@us.ibm.com>
 * Christoph Lameter
 */

#include <linux/migrate.h>
#include <linux/module.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/pagemap.h>
#include <linux/buffer_head.h>
#include <linux/mm_inline.h>
#include <linux/nsproxy.h>
#include <linux/pagevec.h>
#include <linux/rmap.h>
#include <linux/topology.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/writeback.h>
#include <linux/mempolicy.h>
#include <linux/vmalloc.h>
#include <linux/security.h>
#include <linux/memcontrol.h>
#include <linux/syscalls.h>

#include "internal.h"

#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))

/*
 * migrate_prep() needs to be called before we start compiling a list of pages
 * to be migrated using isolate_lru_page().
 */
int migrate_prep(void)
{
	/*
	 * Clear the LRU lists so pages can be isolated.
	 * Note that pages may be moved off the LRU after we have
	 * drained them. Those pages will fail to migrate like other
	 * pages that may be busy.
	 */
	lru_add_drain_all();

	return 0;
}

/*
 * Add isolated pages on the list back to the LRU under page lock
 * to avoid leaking evictable pages back onto the unevictable list.
 *
 * Returns the number of pages put back.
 */
int putback_lru_pages(struct list_head *l)
{
	struct page *page;
	struct page *page2;
	int count = 0;

	list_for_each_entry_safe(page, page2, l, lru) {
		list_del(&page->lru);
		putback_lru_page(page);
		count++;
	}
	return count;
}

/*
 * Restore a potential migration pte to a working pte entry
 */
static void remove_migration_pte(struct vm_area_struct *vma,
		struct page *old, struct page *new)
{
	struct mm_struct *mm = vma->vm_mm;
	swp_entry_t entry;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *ptep, pte;
	spinlock_t *ptl;
	unsigned long addr = page_address_in_vma(new, vma);

	if (addr == -EFAULT)
		return;

	pgd = pgd_offset(mm, addr);
	if (!pgd_present(*pgd))
		return;

	pud = pud_offset(pgd, addr);
	if (!pud_present(*pud))
		return;

	pmd = pmd_offset(pud, addr);
	if (!pmd_present(*pmd))
		return;

	ptep = pte_offset_map(pmd, addr);

	if (!is_swap_pte(*ptep)) {
		pte_unmap(ptep);
		return;
	}

	ptl = pte_lockptr(mm, pmd);
	spin_lock(ptl);
	pte = *ptep;
	if (!is_swap_pte(pte))
		goto out;

	entry = pte_to_swp_entry(pte);

	if (!is_migration_entry(entry) || migration_entry_to_page(entry) != old)
		goto out;

	/*
	 * Yes, ignore the return value from a GFP_ATOMIC mem_cgroup_charge.
	 * Failure is not an option here: we're now expected to remove every
	 * migration pte, and will cause crashes otherwise.  Normally this
	 * is not an issue: mem_cgroup_prepare_migration bumped up the old
	 * page_cgroup count for safety, that's now attached to the new page,
	 * so this charge should just be another increment of the count,
	 * to keep in balance with rmap.c's mem_cgroup_uncharging.  But if
	 * there's been a force_empty, those reference counts may no longer
	 * be reliable, and this charge can actually fail: oh well, we don't
	 * make the situation any worse by proceeding as if it had succeeded.
	 */
	mem_cgroup_charge(new, mm, GFP_ATOMIC);

	get_page(new);
	pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
	if (is_write_migration_entry(entry))
		pte = pte_mkwrite(pte);
	flush_cache_page(vma, addr, pte_pfn(pte));
	set_pte_at(mm, addr, ptep, pte);

	if (PageAnon(new))
		page_add_anon_rmap(new, vma, addr);
	else
		page_add_file_rmap(new);

	/* No need to invalidate - it was non-present before */
	update_mmu_cache(vma, addr, pte);

out:
	pte_unmap_unlock(ptep, ptl);
}

/*
 * Note that remove_file_migration_ptes will only work on regular mappings;
 * nonlinear mappings do not use migration entries.
 */
static void remove_file_migration_ptes(struct page *old, struct page *new)
{
	struct vm_area_struct *vma;
	struct address_space *mapping = page_mapping(new);
	struct prio_tree_iter iter;
	pgoff_t pgoff = new->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);

	if (!mapping)
		return;

	spin_lock(&mapping->i_mmap_lock);

	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff)
		remove_migration_pte(vma, old, new);

	spin_unlock(&mapping->i_mmap_lock);
}

/*
 * Must hold mmap_sem lock on at least one of the vmas containing
 * the page so that the anon_vma cannot vanish.
 */
static void remove_anon_migration_ptes(struct page *old, struct page *new)
{
	struct anon_vma *anon_vma;
	struct vm_area_struct *vma;
	unsigned long mapping;

	mapping = (unsigned long)new->mapping;

	if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0)
		return;

	/*
	 * We hold the mmap_sem lock. So no need to call page_lock_anon_vma.
	 */
	anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON);
	spin_lock(&anon_vma->lock);

	list_for_each_entry(vma, &anon_vma->head, anon_vma_node)
		remove_migration_pte(vma, old, new);

	spin_unlock(&anon_vma->lock);
}

/*
 * Get rid of all migration entries and replace them by
 * references to the indicated page.
 */
static void remove_migration_ptes(struct page *old, struct page *new)
{
	if (PageAnon(new))
		remove_anon_migration_ptes(old, new);
	else
		remove_file_migration_ptes(old, new);
}

/*
 * Something used the pte of a page under migration. We need to
 * get to the page and wait until migration is finished.
 * When we return from this function the fault will be retried.
 *
 * This function is called from do_swap_page().
 */
void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
				unsigned long address)
{
	pte_t *ptep, pte;
	spinlock_t *ptl;
	swp_entry_t entry;
	struct page *page;

	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
	pte = *ptep;
	if (!is_swap_pte(pte))
		goto out;

	entry = pte_to_swp_entry(pte);
	if (!is_migration_entry(entry))
		goto out;

	page = migration_entry_to_page(entry);

	/*
	 * Once the radix-tree replacement of page migration has started,
	 * page_count *must* be zero. And we don't want to call
	 * wait_on_page_locked() against a page without holding a reference
	 * from get_page(). So we use get_page_unless_zero() here; even if
	 * it fails, the page fault will simply occur again.
	 */
	if (!get_page_unless_zero(page))
		goto out;
	pte_unmap_unlock(ptep, ptl);
	wait_on_page_locked(page);
	put_page(page);
	return;
out:
	pte_unmap_unlock(ptep, ptl);
}
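
/*
 * For reference, the call site in do_swap_page() looks roughly like the
 * sketch below (paraphrased from mm/memory.c of this era, not part of this
 * file): the fault handler decodes the swap entry and, if it turns out to be
 * a migration entry, waits here and lets the fault be retried afterwards.
 *
 *	entry = pte_to_swp_entry(orig_pte);
 *	if (is_migration_entry(entry)) {
 *		migration_entry_wait(mm, pmd, address);
 *		goto out;
 *	}
 */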

/*
 * Replace the page in the mapping.
 *
 * The number of remaining references must be:
 * 1 for anonymous pages without a mapping
 * 2 for pages with a mapping
 * 3 for pages with a mapping and PagePrivate set.
 */
static int migrate_page_move_mapping(struct address_space *mapping,
		struct page *newpage, struct page *page)
{
	int expected_count;
	void **pslot;

	if (!mapping) {
		/* Anonymous page without mapping */
		if (page_count(page) != 1)
			return -EAGAIN;
		return 0;
	}

	spin_lock_irq(&mapping->tree_lock);

	pslot = radix_tree_lookup_slot(&mapping->page_tree,
					page_index(page));

	expected_count = 2 + !!PagePrivate(page);
	if (page_count(page) != expected_count ||
			(struct page *)radix_tree_deref_slot(pslot) != page) {
		spin_unlock_irq(&mapping->tree_lock);
		return -EAGAIN;
	}

	if (!page_freeze_refs(page, expected_count)) {
		spin_unlock_irq(&mapping->tree_lock);
		return -EAGAIN;
	}

	/*
	 * Now we know that no one else is looking at the page.
	 */
	get_page(newpage);	/* add cache reference */
#ifdef CONFIG_SWAP
	if (PageSwapCache(page)) {
		SetPageSwapCache(newpage);
		set_page_private(newpage, page_private(page));
	}
#endif

	radix_tree_replace_slot(pslot, newpage);

	page_unfreeze_refs(page, expected_count);
	/*
	 * Drop cache reference from old page.
	 * We know this isn't the last reference.
	 */
	__put_page(page);

	/*
	 * If moved to a different zone then also account
	 * the page for that zone. Other VM counters will be
	 * taken care of when we establish references to the
	 * new page and drop references to the old page.
	 *
	 * Note that anonymous pages are accounted for
	 * via NR_FILE_PAGES and NR_ANON_PAGES if they
	 * are mapped to swap space.
	 */
	__dec_zone_page_state(page, NR_FILE_PAGES);
	__inc_zone_page_state(newpage, NR_FILE_PAGES);

	spin_unlock_irq(&mapping->tree_lock);

	return 0;
}
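
/*
 * Worked example of the reference arithmetic above: by the time this runs,
 * all ptes have been replaced by migration entries, so a page-cache page is
 * held only by the radix tree (1) and by the migration caller that isolated
 * it from the LRU (1), giving expected_count == 2; attached buffers
 * (PagePrivate) account for one more, giving 3.  An anonymous page that is
 * not in the swap cache has no mapping and is held only by the caller, which
 * is why the !mapping branch checks for page_count(page) == 1.
 */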

/*
 * Copy the page to its new location
 */
static void migrate_page_copy(struct page *newpage, struct page *page)
{
	int anon;

	copy_highpage(newpage, page);

	if (PageError(page))
		SetPageError(newpage);
	if (PageReferenced(page))
		SetPageReferenced(newpage);
	if (PageUptodate(page))
		SetPageUptodate(newpage);
	if (TestClearPageActive(page)) {
		VM_BUG_ON(PageUnevictable(page));
		SetPageActive(newpage);
	} else
		unevictable_migrate_page(newpage, page);
	if (PageChecked(page))
		SetPageChecked(newpage);
	if (PageMappedToDisk(page))
		SetPageMappedToDisk(newpage);

	if (PageDirty(page)) {
		clear_page_dirty_for_io(page);
		/*
		 * Want to mark the page and the radix tree as dirty, and
		 * redo the accounting that clear_page_dirty_for_io undid,
		 * but we can't use set_page_dirty because that function
		 * is actually a signal that all of the page has become dirty,
		 * whereas only part of our page may be dirty.
		 */
		__set_page_dirty_nobuffers(newpage);
	}

	mlock_migrate_page(newpage, page);

#ifdef CONFIG_SWAP
	ClearPageSwapCache(page);
#endif
	ClearPagePrivate(page);
	set_page_private(page, 0);
	/* page->mapping contains a flag for PageAnon() */
	anon = PageAnon(page);
	page->mapping = NULL;

	if (!anon) /* This page was removed from the radix tree. */
		mem_cgroup_uncharge_cache_page(page);

	/*
	 * If any waiters have accumulated on the new page then
	 * wake them up.
	 */
	if (PageWriteback(newpage))
		end_page_writeback(newpage);
}

/************************************************************
 *                    Migration functions
 ***********************************************************/

/* Always fail migration. Used for mappings that are not movable */
int fail_migrate_page(struct address_space *mapping,
			struct page *newpage, struct page *page)
{
	return -EIO;
}
EXPORT_SYMBOL(fail_migrate_page);

/*
 * Common logic to directly migrate a single page suitable for
 * pages that do not use PagePrivate.
 *
 * Pages are locked upon entry and exit.
 */
int migrate_page(struct address_space *mapping,
		struct page *newpage, struct page *page)
{
	int rc;

	BUG_ON(PageWriteback(page));	/* Writeback must be complete */

	rc = migrate_page_move_mapping(mapping, newpage, page);

	if (rc)
		return rc;

	migrate_page_copy(newpage, page);
	return 0;
}
EXPORT_SYMBOL(migrate_page);
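
/*
 * migrate_page() is used directly as the ->migratepage callback for mappings
 * whose pages carry no fs-private state (for instance, the swap address
 * space sets .migratepage = migrate_page in its address_space_operations),
 * so nothing beyond moving the mapping and copying the data is needed.
 */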

#ifdef CONFIG_BLOCK
/*
 * Migration function for pages with buffers. This function can only be used
 * if the underlying filesystem guarantees that no other references to "page"
 * exist.
 */
int buffer_migrate_page(struct address_space *mapping,
		struct page *newpage, struct page *page)
{
	struct buffer_head *bh, *head;
	int rc;

	if (!page_has_buffers(page))
		return migrate_page(mapping, newpage, page);

	head = page_buffers(page);

	rc = migrate_page_move_mapping(mapping, newpage, page);

	if (rc)
		return rc;

	bh = head;
	do {
		get_bh(bh);
		lock_buffer(bh);
		bh = bh->b_this_page;

	} while (bh != head);

	ClearPagePrivate(page);
	set_page_private(newpage, page_private(page));
	set_page_private(page, 0);
	put_page(page);
	get_page(newpage);

	bh = head;
	do {
		set_bh_page(bh, newpage, bh_offset(bh));
		bh = bh->b_this_page;

	} while (bh != head);

	SetPagePrivate(newpage);

	migrate_page_copy(newpage, page);

	bh = head;
	do {
		unlock_buffer(bh);
		put_bh(bh);
		bh = bh->b_this_page;

	} while (bh != head);

	return 0;
}
EXPORT_SYMBOL(buffer_migrate_page);
#endif
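
/*
 * Filesystems opt in to these helpers through their
 * address_space_operations.  An illustrative (hypothetical) entry for a
 * block-backed filesystem, rather than relying on fallback_migrate_page(),
 * would look like:
 *
 *	static const struct address_space_operations example_aops = {
 *		...
 *		.migratepage	= buffer_migrate_page,
 *	};
 */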

/*
 * Writeback a page to clean the dirty state
 */
static int writeout(struct address_space *mapping, struct page *page)
{
	struct writeback_control wbc = {
		.sync_mode = WB_SYNC_NONE,
		.nr_to_write = 1,
		.range_start = 0,
		.range_end = LLONG_MAX,
		.nonblocking = 1,
		.for_reclaim = 1
	};
	int rc;

	if (!mapping->a_ops->writepage)
		/* No write method for the address space */
		return -EINVAL;

	if (!clear_page_dirty_for_io(page))
		/* Someone else already triggered a write */
		return -EAGAIN;

	/*
	 * A dirty page may imply that the underlying filesystem has
	 * the page on some queue. So the page must be clean for
	 * migration. Writeout may mean we lose the lock and the
	 * page state is no longer what we checked for earlier.
	 * At this point we know that the migration attempt cannot
	 * be successful.
	 */
	remove_migration_ptes(page, page);

	rc = mapping->a_ops->writepage(page, &wbc);

	if (rc != AOP_WRITEPAGE_ACTIVATE)
		/* unlocked. Relock */
		lock_page(page);

	return (rc < 0) ? -EIO : -EAGAIN;
}

/*
 * Default handling if a filesystem does not provide a migration function.
 */
static int fallback_migrate_page(struct address_space *mapping,
	struct page *newpage, struct page *page)
{
	if (PageDirty(page))
		return writeout(mapping, page);

	/*
	 * Buffers may be managed in a filesystem specific way.
	 * We must have no buffers or drop them.
	 */
	if (PagePrivate(page) &&
	    !try_to_release_page(page, GFP_KERNEL))
		return -EAGAIN;

	return migrate_page(mapping, newpage, page);
}

/*
 * Move a page to a newly allocated page.
 * The page is locked and all ptes have been successfully removed.
 *
 * The new page will have replaced the old page if this function
 * is successful.
 *
 * Return value:
 *   < 0 - error code
 *  == 0 - success
 */
static int move_to_new_page(struct page *newpage, struct page *page)
{
	struct address_space *mapping;
	int rc;

	/*
	 * Block others from accessing the page when we get around to
	 * establishing additional references. We are the only one
	 * holding a reference to the new page at this point.
	 */
	if (!trylock_page(newpage))
		BUG();

	/* Prepare mapping for the new page. */
	newpage->index = page->index;
	newpage->mapping = page->mapping;
	if (PageSwapBacked(page))
		SetPageSwapBacked(newpage);

	mapping = page_mapping(page);
	if (!mapping)
		rc = migrate_page(mapping, newpage, page);
	else if (mapping->a_ops->migratepage)
		/*
		 * Most pages have a mapping and most filesystems
		 * should provide a migration function. Anonymous
		 * pages are part of swap space which also has its
		 * own migration function. This is the most common
		 * path for page migration.
		 */
		rc = mapping->a_ops->migratepage(mapping,
						newpage, page);
	else
		rc = fallback_migrate_page(mapping, newpage, page);

	if (!rc) {
		remove_migration_ptes(page, newpage);
	} else
		newpage->mapping = NULL;

	unlock_page(newpage);

	return rc;
}

/*
 * Obtain the lock on page, remove all ptes and migrate the page
 * to the newly allocated page in newpage.
 */
static int unmap_and_move(new_page_t get_new_page, unsigned long private,
			struct page *page, int force)
{
	int rc = 0;
	int *result = NULL;
	struct page *newpage = get_new_page(page, private, &result);
	int rcu_locked = 0;
	int charge = 0;

	if (!newpage)
		return -ENOMEM;

	if (page_count(page) == 1) {
		/* page was freed from under us. So we are done. */
		goto move_newpage;
	}

	charge = mem_cgroup_prepare_migration(page, newpage);
	if (charge == -ENOMEM) {
		rc = -ENOMEM;
		goto move_newpage;
	}
	/* prepare cgroup just returns 0 or -ENOMEM */
	BUG_ON(charge);

	rc = -EAGAIN;
	if (!trylock_page(page)) {
		if (!force)
			goto move_newpage;
		lock_page(page);
	}

	if (PageWriteback(page)) {
		if (!force)
			goto unlock;
		wait_on_page_writeback(page);
	}
	/*
	 * try_to_unmap() will drop page->mapcount to 0 here.  Once the page
	 * is unmapped we can no longer notice that the anon_vma is freed
	 * while we are migrating the page, so take rcu_read_lock() to delay
	 * freeing of the anon_vma until the end of migration.  File cache
	 * pages are not a problem: migration holds the page lock (and uses
	 * writepage() where needed) for them, so only anonymous pages need
	 * this care.
	 */
	if (PageAnon(page)) {
		rcu_read_lock();
		rcu_locked = 1;
	}

	/*
	 * Corner case handling:
	 * 1. When a new swap-cache page is being read in, it is added to the
	 * LRU and treated as swapcache but it has no rmap yet.
	 * Calling try_to_unmap() against a page->mapping==NULL page will
	 * trigger a BUG.  So handle it here.
	 * 2. An orphaned page (see truncate_complete_page) might have
	 * fs-private metadata. The page can be picked up due to memory
	 * offlining.  Everywhere else except page reclaim, the page is
	 * invisible to the vm, so the page can not be migrated.  So try to
	 * free the metadata, so the page can be freed.
	 */
	if (!page->mapping) {
		if (!PageAnon(page) && PagePrivate(page)) {
			/*
			 * Go direct to try_to_free_buffers() here because
			 * a) that's what try_to_release_page() would do anyway
			 * b) we may be under rcu_read_lock() here, so we can't
			 *    use GFP_KERNEL which is what try_to_release_page()
			 *    needs to be effective.
			 */
			try_to_free_buffers(page);
		}
		goto rcu_unlock;
	}

	/* Establish migration ptes or remove ptes */
	try_to_unmap(page, 1);

	if (!page_mapped(page))
		rc = move_to_new_page(newpage, page);

	if (rc)
		remove_migration_ptes(page, page);
rcu_unlock:
	if (rcu_locked)
		rcu_read_unlock();

unlock:
	unlock_page(page);

	if (rc != -EAGAIN) {
		/*
		 * A page that has been migrated has all references
		 * removed and will be freed. A page that has not been
		 * migrated will have kept its references and be
		 * restored.
		 */
		list_del(&page->lru);
		putback_lru_page(page);
	}

move_newpage:
	if (!charge)
		mem_cgroup_end_migration(newpage);

	/*
	 * Move the new page to the LRU. If migration was not successful
	 * then this will free the page.
	 */
	putback_lru_page(newpage);

	if (result) {
		if (rc)
			*result = rc;
		else
			*result = page_to_nid(newpage);
	}
	return rc;
}

/*
 * migrate_pages
 *
 * The function takes one list of pages to migrate and an allocation
 * callback which, given the page to be migrated and the private data,
 * determines the target of the move and allocates the new page.
 *
 * The function returns after 10 passes or when no pages are movable
 * any more, either because the 'from' list has become empty or because
 * no retryable pages remain. All pages will be returned to the LRU
 * or freed.
 *
 * Return: Number of pages not migrated or error code.
 */
int migrate_pages(struct list_head *from,
		new_page_t get_new_page, unsigned long private)
{
	int retry = 1;
	int nr_failed = 0;
	int pass = 0;
	struct page *page;
	struct page *page2;
	int swapwrite = current->flags & PF_SWAPWRITE;
	int rc;

	if (!swapwrite)
		current->flags |= PF_SWAPWRITE;

	for (pass = 0; pass < 10 && retry; pass++) {
		retry = 0;

		list_for_each_entry_safe(page, page2, from, lru) {
			cond_resched();

			rc = unmap_and_move(get_new_page, private,
						page, pass > 2);

			switch (rc) {
			case -ENOMEM:
				goto out;
			case -EAGAIN:
				retry++;
				break;
			case 0:
				break;
			default:
				/* Permanent failure */
				nr_failed++;
				break;
			}
		}
	}
	rc = 0;
out:
	if (!swapwrite)
		current->flags &= ~PF_SWAPWRITE;

	putback_lru_pages(from);

	if (rc)
		return rc;

	return nr_failed + retry;
}
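
/*
 * Illustrative use of migrate_pages() (a sketch only; my_target_nid,
 * my_new_page and my_pagelist are hypothetical names, and "page" stands for
 * a struct page the caller obtained elsewhere): the caller drains the LRU
 * pagevecs, isolates the pages it wants to move onto a private list and
 * supplies an allocator callback that picks the target page.
 *
 *	static struct page *my_new_page(struct page *page, unsigned long private,
 *					int **result)
 *	{
 *		return alloc_pages_node(my_target_nid, GFP_HIGHUSER_MOVABLE, 0);
 *	}
 *
 *	LIST_HEAD(my_pagelist);
 *
 *	migrate_prep();
 *	if (!isolate_lru_page(page))
 *		list_add_tail(&page->lru, &my_pagelist);
 *	if (!list_empty(&my_pagelist))
 *		migrate_pages(&my_pagelist, my_new_page, 0);
 *
 * do_move_page_to_node_array() below follows this pattern.
 */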

#ifdef CONFIG_NUMA
/*
 * Move a list of individual pages
 */
struct page_to_node {
	unsigned long addr;
	struct page *page;
	int node;
	int status;
};

static struct page *new_page_node(struct page *p, unsigned long private,
		int **result)
{
	struct page_to_node *pm = (struct page_to_node *)private;

	while (pm->node != MAX_NUMNODES && pm->page != p)
		pm++;

	if (pm->node == MAX_NUMNODES)
		return NULL;

	*result = &pm->status;

	return alloc_pages_node(pm->node,
				GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0);
}
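
/*
 * Note: GFP_THISNODE keeps the allocation from falling back to a different
 * node, so new_page_node() only hands back a page that really lives on the
 * node requested in the pm array; if that node has no free memory the
 * allocation fails rather than silently migrating the page elsewhere.
 */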

/*
 * Move a set of pages as indicated in the pm array. The addr
 * field must be set to the virtual address of the page to be moved
 * and the node number must contain a valid target node.
 * The pm array ends with node = MAX_NUMNODES.
 */
static int do_move_page_to_node_array(struct mm_struct *mm,
				      struct page_to_node *pm,
				      int migrate_all)
{
	int err;
	struct page_to_node *pp;
	LIST_HEAD(pagelist);

	migrate_prep();
	down_read(&mm->mmap_sem);

	/*
	 * Build a list of pages to migrate
	 */
	for (pp = pm; pp->node != MAX_NUMNODES; pp++) {
		struct vm_area_struct *vma;
		struct page *page;

		/*
		 * A valid page pointer that will not match any of the
		 * pages that will be moved.
		 */
		pp->page = ZERO_PAGE(0);

		err = -EFAULT;
		vma = find_vma(mm, pp->addr);
		if (!vma || !vma_migratable(vma))
			goto set_status;

		page = follow_page(vma, pp->addr, FOLL_GET);

		err = PTR_ERR(page);
		if (IS_ERR(page))
			goto set_status;

		err = -ENOENT;
		if (!page)
			goto set_status;

		if (PageReserved(page))		/* Check for zero page */
			goto put_and_set;

		pp->page = page;
		err = page_to_nid(page);

		if (err == pp->node)
			/*
			 * Node already in the right place
			 */
			goto put_and_set;

		err = -EACCES;
		if (page_mapcount(page) > 1 &&
				!migrate_all)
			goto put_and_set;

		err = isolate_lru_page(page);
		if (!err)
			list_add_tail(&page->lru, &pagelist);
put_and_set:
		/*
		 * Either remove the duplicate refcount from
		 * isolate_lru_page() or drop the page ref if it was
		 * not isolated.
		 */
		put_page(page);
set_status:
		pp->status = err;
	}

	err = 0;
	if (!list_empty(&pagelist))
		err = migrate_pages(&pagelist, new_page_node,
				(unsigned long)pm);

	up_read(&mm->mmap_sem);
	return err;
}

/*
 * Migrate an array of page addresses onto an array of nodes and fill
 * in the corresponding array of status values.
 */
static int do_pages_move(struct mm_struct *mm, struct task_struct *task,
			 unsigned long nr_pages,
			 const void __user * __user *pages,
			 const int __user *nodes,
			 int __user *status, int flags)
{
	struct page_to_node *pm = NULL;
	nodemask_t task_nodes;
	int err = 0;
	int i;

	task_nodes = cpuset_mems_allowed(task);

	/* Limit nr_pages so that the multiplication cannot overflow */
	if (nr_pages >= ULONG_MAX / sizeof(struct page_to_node) - 1) {
		err = -E2BIG;
		goto out;
	}

	pm = vmalloc((nr_pages + 1) * sizeof(struct page_to_node));
	if (!pm) {
		err = -ENOMEM;
		goto out;
	}

	/*
	 * Get parameters from user space and initialize the pm
	 * array. Return various errors if the user did something wrong.
	 */
	for (i = 0; i < nr_pages; i++) {
		const void __user *p;

		err = -EFAULT;
		if (get_user(p, pages + i))
			goto out_pm;

		pm[i].addr = (unsigned long)p;
		if (nodes) {
			int node;

			if (get_user(node, nodes + i))
				goto out_pm;

			err = -ENODEV;
			if (!node_state(node, N_HIGH_MEMORY))
				goto out_pm;

			err = -EACCES;
			if (!node_isset(node, task_nodes))
				goto out_pm;

			pm[i].node = node;
		} else
			pm[i].node = 0;	/* anything to not match MAX_NUMNODES */
	}
	/* End marker */
	pm[nr_pages].node = MAX_NUMNODES;

	err = do_move_page_to_node_array(mm, pm, flags & MPOL_MF_MOVE_ALL);
	if (err >= 0)
		/* Return status information */
		for (i = 0; i < nr_pages; i++)
			if (put_user(pm[i].status, status + i))
				err = -EFAULT;

out_pm:
	vfree(pm);
out:
	return err;
}

/*
 * Determine the nodes of an array of pages and store them in an array
 * of status values.
 */
static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
			 const void __user * __user *pages,
			 int __user *status)
{
	unsigned long i;
	int err;

	down_read(&mm->mmap_sem);

	for (i = 0; i < nr_pages; i++) {
		const void __user *p;
		unsigned long addr;
		struct vm_area_struct *vma;
		struct page *page;

		err = -EFAULT;
		if (get_user(p, pages+i))
			goto out;
		addr = (unsigned long) p;

		vma = find_vma(mm, addr);
		if (!vma)
			goto set_status;

		page = follow_page(vma, addr, 0);

		err = PTR_ERR(page);
		if (IS_ERR(page))
			goto set_status;

		err = -ENOENT;
		/* Use PageReserved to check for zero page */
		if (!page || PageReserved(page))
			goto set_status;

		err = page_to_nid(page);
set_status:
		put_user(err, status+i);
	}
	err = 0;

out:
	up_read(&mm->mmap_sem);
	return err;
}

/*
 * Move a list of pages in the address space of the currently executing
 * process.
 */
asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages,
			const void __user * __user *pages,
			const int __user *nodes,
			int __user *status, int flags)
{
	struct task_struct *task;
	struct mm_struct *mm;
	int err;

	/* Check flags */
	if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
		return -EINVAL;

	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
		return -EPERM;

	/* Find the mm_struct */
	read_lock(&tasklist_lock);
	task = pid ? find_task_by_vpid(pid) : current;
	if (!task) {
		read_unlock(&tasklist_lock);
		return -ESRCH;
	}
	mm = get_task_mm(task);
	read_unlock(&tasklist_lock);

	if (!mm)
		return -EINVAL;

	/*
	 * Check if this process has the right to modify the specified
	 * process. The right exists if the process has administrative
	 * capabilities, superuser privileges or the same
	 * userid as the target process.
	 */
	if ((current->euid != task->suid) && (current->euid != task->uid) &&
	    (current->uid != task->suid) && (current->uid != task->uid) &&
	    !capable(CAP_SYS_NICE)) {
		err = -EPERM;
		goto out;
	}

	err = security_task_movememory(task);
	if (err)
		goto out;

	if (nodes) {
		err = do_pages_move(mm, task, nr_pages, pages, nodes, status,
				    flags);
	} else {
		err = do_pages_stat(mm, nr_pages, pages, status);
	}

out:
	mmput(mm);
	return err;
}
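
/*
 * From user space this syscall is normally reached through the libnuma
 * wrapper declared in <numaif.h>.  A minimal sketch of moving one page of a
 * process to node 1 (illustrative only, error handling omitted and
 * some_address hypothetical) looks like:
 *
 *	void *pages[1] = { some_address };
 *	int nodes[1] = { 1 };
 *	int status[1];
 *
 *	move_pages(pid, 1, pages, nodes, status, MPOL_MF_MOVE);
 *
 * Passing nodes == NULL instead queries the current node of each page via
 * do_pages_stat() above.
 */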

/*
 * Call migration functions in the vma_ops that may prepare
 * memory in a vm for migration. Migration functions may perform
 * the migration for vmas that do not have an underlying page struct.
 */
int migrate_vmas(struct mm_struct *mm, const nodemask_t *to,
	const nodemask_t *from, unsigned long flags)
{
	struct vm_area_struct *vma;
	int err = 0;

	for (vma = mm->mmap; vma->vm_next && !err; vma = vma->vm_next) {
		if (vma->vm_ops && vma->vm_ops->migrate) {
			err = vma->vm_ops->migrate(vma, to, from, flags);
			if (err)
				break;
		}
	}
	return err;
}
#endif