migrate.c revision e78bbfa8262424417a29349a8064a535053912b9
/*
 * Memory Migration functionality - linux/mm/migration.c
 *
 * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
 *
 * Page migration was first developed in the context of the memory hotplug
 * project. The main authors of the migration code are:
 *
 * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
 * Hirokazu Takahashi <taka@valinux.co.jp>
 * Dave Hansen <haveblue@us.ibm.com>
 * Christoph Lameter
 */

#include <linux/migrate.h>
#include <linux/module.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/pagemap.h>
#include <linux/buffer_head.h>
#include <linux/mm_inline.h>
#include <linux/nsproxy.h>
#include <linux/pagevec.h>
#include <linux/rmap.h>
#include <linux/topology.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/writeback.h>
#include <linux/mempolicy.h>
#include <linux/vmalloc.h>
#include <linux/security.h>
#include <linux/memcontrol.h>
#include <linux/syscalls.h>

#include "internal.h"

#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))

/*
 * migrate_prep() needs to be called before we start compiling a list of pages
 * to be migrated using isolate_lru_page().
 */
int migrate_prep(void)
{
	/*
	 * Clear the LRU lists so pages can be isolated.
	 * Note that pages may be moved off the LRU after we have
	 * drained them. Those pages will fail to migrate like other
	 * pages that may be busy.
	 */
	lru_add_drain_all();

	return 0;
}

/*
 * Add isolated pages on the list back to the LRU under page lock
 * to avoid leaking evictable pages back onto the unevictable list.
 *
 * Returns the number of pages put back.
 */
int putback_lru_pages(struct list_head *l)
{
	struct page *page;
	struct page *page2;
	int count = 0;

	list_for_each_entry_safe(page, page2, l, lru) {
		list_del(&page->lru);
		putback_lru_page(page);
		count++;
	}
	return count;
}

/*
 * Restore a potential migration pte to a working pte entry
 */
static void remove_migration_pte(struct vm_area_struct *vma,
		struct page *old, struct page *new)
{
	struct mm_struct *mm = vma->vm_mm;
	swp_entry_t entry;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *ptep, pte;
	spinlock_t *ptl;
	unsigned long addr = page_address_in_vma(new, vma);

	if (addr == -EFAULT)
		return;

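	/*
	 * Walk the page tables down to the pte that may hold the
	 * migration entry for addr; bail out if any level is not present.
	 */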
	pgd = pgd_offset(mm, addr);
	if (!pgd_present(*pgd))
		return;

	pud = pud_offset(pgd, addr);
	if (!pud_present(*pud))
		return;

	pmd = pmd_offset(pud, addr);
	if (!pmd_present(*pmd))
		return;

	ptep = pte_offset_map(pmd, addr);

	if (!is_swap_pte(*ptep)) {
		pte_unmap(ptep);
		return;
	}

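	/*
	 * Take the pte lock and recheck: the entry may have been faulted
	 * in or changed since the unlocked test above.
	 */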
	ptl = pte_lockptr(mm, pmd);
	spin_lock(ptl);
	pte = *ptep;
	if (!is_swap_pte(pte))
		goto out;

	entry = pte_to_swp_entry(pte);

	if (!is_migration_entry(entry) || migration_entry_to_page(entry) != old)
		goto out;

	/*
	 * Yes, ignore the return value from a GFP_ATOMIC mem_cgroup_charge.
	 * Failure is not an option here: we're now expected to remove every
	 * migration pte, and would cause crashes otherwise.  Normally this
	 * is not an issue: mem_cgroup_prepare_migration bumped up the old
	 * page_cgroup count for safety, and that is now attached to the new
	 * page, so this charge should just be another increment of the count,
	 * to keep in balance with rmap.c's mem_cgroup_uncharging.  But if
	 * there's been a force_empty, those reference counts may no longer
	 * be reliable, and this charge can actually fail: oh well, we don't
	 * make the situation any worse by proceeding as if it had succeeded.
	 */
	mem_cgroup_charge(new, mm, GFP_ATOMIC);

	get_page(new);
	pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
	if (is_write_migration_entry(entry))
		pte = pte_mkwrite(pte);
	flush_cache_page(vma, addr, pte_pfn(pte));
	set_pte_at(mm, addr, ptep, pte);

	if (PageAnon(new))
		page_add_anon_rmap(new, vma, addr);
	else
		page_add_file_rmap(new);

	/* No need to invalidate - it was non-present before */
	update_mmu_cache(vma, addr, pte);

out:
	pte_unmap_unlock(ptep, ptl);
}

/*
 * Note that remove_file_migration_ptes() only works on regular mappings;
 * nonlinear mappings do not use migration entries.
 */
static void remove_file_migration_ptes(struct page *old, struct page *new)
{
	struct vm_area_struct *vma;
	struct address_space *mapping = page_mapping(new);
	struct prio_tree_iter iter;
	pgoff_t pgoff = new->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);

	if (!mapping)
		return;

	spin_lock(&mapping->i_mmap_lock);

	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff)
		remove_migration_pte(vma, old, new);

	spin_unlock(&mapping->i_mmap_lock);
}

/*
 * Must hold mmap_sem lock on at least one of the vmas containing
 * the page so that the anon_vma cannot vanish.
 */
static void remove_anon_migration_ptes(struct page *old, struct page *new)
{
	struct anon_vma *anon_vma;
	struct vm_area_struct *vma;
	unsigned long mapping;

	mapping = (unsigned long)new->mapping;

	if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0)
		return;

	/*
	 * We hold the mmap_sem lock. So no need to call page_lock_anon_vma.
	 */
	anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON);
	spin_lock(&anon_vma->lock);

	list_for_each_entry(vma, &anon_vma->head, anon_vma_node)
		remove_migration_pte(vma, old, new);

	spin_unlock(&anon_vma->lock);
}

/*
 * Get rid of all migration entries and replace them by
 * references to the indicated page.
 */
static void remove_migration_ptes(struct page *old, struct page *new)
{
	if (PageAnon(new))
		remove_anon_migration_ptes(old, new);
	else
		remove_file_migration_ptes(old, new);
}

/*
 * Something used the pte of a page under migration. We need to
 * get to the page and wait until migration is finished.
 * When we return from this function the fault will be retried.
 *
 * This function is called from do_swap_page().
 */
void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
				unsigned long address)
{
	pte_t *ptep, pte;
	spinlock_t *ptl;
	swp_entry_t entry;
	struct page *page;

	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
	pte = *ptep;
	if (!is_swap_pte(pte))
		goto out;

	entry = pte_to_swp_entry(pte);
	if (!is_migration_entry(entry))
		goto out;

	page = migration_entry_to_page(entry);

	/*
	 * Once the radix-tree replacement step of page migration has
	 * started, page_count *must* be zero, and we must not call
	 * wait_on_page_locked() against a page we do not hold a reference
	 * on.  So use get_page_unless_zero() here; even if it fails, the
	 * fault will simply occur again.
	 */
	if (!get_page_unless_zero(page))
		goto out;
	pte_unmap_unlock(ptep, ptl);
	wait_on_page_locked(page);
	put_page(page);
	return;
out:
	pte_unmap_unlock(ptep, ptl);
}

/*
 * Replace the page in the mapping.
 *
 * The number of remaining references must be:
 * 1 for anonymous pages without a mapping
 * 2 for pages with a mapping
 * 3 for pages with a mapping and PagePrivate set.
 */
static int migrate_page_move_mapping(struct address_space *mapping,
		struct page *newpage, struct page *page)
{
	int expected_count;
	void **pslot;

	if (!mapping) {
		/* Anonymous page without mapping */
		if (page_count(page) != 1)
			return -EAGAIN;
		return 0;
	}

	spin_lock_irq(&mapping->tree_lock);

	pslot = radix_tree_lookup_slot(&mapping->page_tree,
					page_index(page));

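	/*
	 * The expected references are the page cache's reference, the
	 * reference taken when the page was isolated for migration, and
	 * one more if buffers (PagePrivate) are attached.
	 */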
	expected_count = 2 + !!PagePrivate(page);
	if (page_count(page) != expected_count ||
			(struct page *)radix_tree_deref_slot(pslot) != page) {
		spin_unlock_irq(&mapping->tree_lock);
		return -EAGAIN;
	}

	if (!page_freeze_refs(page, expected_count)) {
		spin_unlock_irq(&mapping->tree_lock);
		return -EAGAIN;
	}

	/*
	 * Now we know that no one else is looking at the page.
	 */
	get_page(newpage);	/* add cache reference */
#ifdef CONFIG_SWAP
	if (PageSwapCache(page)) {
		SetPageSwapCache(newpage);
		set_page_private(newpage, page_private(page));
	}
#endif

	radix_tree_replace_slot(pslot, newpage);

	page_unfreeze_refs(page, expected_count);
	/*
	 * Drop cache reference from old page.
	 * We know this isn't the last reference.
	 */
	__put_page(page);

	/*
	 * If moved to a different zone then also account
	 * the page for that zone. Other VM counters will be
	 * taken care of when we establish references to the
	 * new page and drop references to the old page.
	 *
	 * Note that anonymous pages are accounted for
	 * via NR_FILE_PAGES and NR_ANON_PAGES if they
	 * are mapped to swap space.
	 */
	__dec_zone_page_state(page, NR_FILE_PAGES);
	__inc_zone_page_state(newpage, NR_FILE_PAGES);

	spin_unlock_irq(&mapping->tree_lock);
	if (!PageSwapCache(newpage))
		mem_cgroup_uncharge_cache_page(page);

	return 0;
}

/*
 * Copy the page to its new location
 */
static void migrate_page_copy(struct page *newpage, struct page *page)
{
	copy_highpage(newpage, page);

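	/*
	 * Transfer the page flags that describe the page's contents and
	 * state from the old page to the new one.
	 */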
	if (PageError(page))
		SetPageError(newpage);
	if (PageReferenced(page))
		SetPageReferenced(newpage);
	if (PageUptodate(page))
		SetPageUptodate(newpage);
	if (TestClearPageActive(page)) {
		VM_BUG_ON(PageUnevictable(page));
		SetPageActive(newpage);
	} else
		unevictable_migrate_page(newpage, page);
	if (PageChecked(page))
		SetPageChecked(newpage);
	if (PageMappedToDisk(page))
		SetPageMappedToDisk(newpage);

	if (PageDirty(page)) {
		clear_page_dirty_for_io(page);
		/*
		 * We want to mark the page and the radix tree as dirty, and
		 * redo the accounting that clear_page_dirty_for_io undid,
		 * but we can't use set_page_dirty because that function
		 * is actually a signal that all of the page has become
		 * dirty, whereas only part of our page may be dirty.
		 */
		__set_page_dirty_nobuffers(newpage);
	}

	mlock_migrate_page(newpage, page);

#ifdef CONFIG_SWAP
	ClearPageSwapCache(page);
#endif
	ClearPagePrivate(page);
	set_page_private(page, 0);
	page->mapping = NULL;

	/*
	 * If any waiters have accumulated on the new page then
	 * wake them up.
	 */
	if (PageWriteback(newpage))
		end_page_writeback(newpage);
}

/************************************************************
 *                    Migration functions
 ***********************************************************/

/* Always fail migration. Used for mappings that are not movable */
int fail_migrate_page(struct address_space *mapping,
			struct page *newpage, struct page *page)
{
	return -EIO;
}
EXPORT_SYMBOL(fail_migrate_page);

/*
 * Common logic to directly migrate a single page suitable for
 * pages that do not use PagePrivate.
 *
 * Pages are locked upon entry and exit.
 */
int migrate_page(struct address_space *mapping,
		struct page *newpage, struct page *page)
{
	int rc;

	BUG_ON(PageWriteback(page));	/* Writeback must be complete */

	rc = migrate_page_move_mapping(mapping, newpage, page);

	if (rc)
		return rc;

	migrate_page_copy(newpage, page);
	return 0;
}
EXPORT_SYMBOL(migrate_page);

#ifdef CONFIG_BLOCK
/*
 * Migration function for pages with buffers. This function can only be used
 * if the underlying filesystem guarantees that no other references to "page"
 * exist.
 */
int buffer_migrate_page(struct address_space *mapping,
		struct page *newpage, struct page *page)
{
	struct buffer_head *bh, *head;
	int rc;

	if (!page_has_buffers(page))
		return migrate_page(mapping, newpage, page);

	head = page_buffers(page);

	rc = migrate_page_move_mapping(mapping, newpage, page);

	if (rc)
		return rc;

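	/*
	 * Pin and lock every buffer on the page so that none of them can
	 * be freed or have I/O issued against them while they are being
	 * handed over to newpage.
	 */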
	bh = head;
	do {
		get_bh(bh);
		lock_buffer(bh);
		bh = bh->b_this_page;

	} while (bh != head);

	ClearPagePrivate(page);
	set_page_private(newpage, page_private(page));
	set_page_private(page, 0);
	put_page(page);
	get_page(newpage);

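	/* Point each buffer head at its new location in newpage. */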
	bh = head;
	do {
		set_bh_page(bh, newpage, bh_offset(bh));
		bh = bh->b_this_page;

	} while (bh != head);

	SetPagePrivate(newpage);

	migrate_page_copy(newpage, page);

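	/*
	 * The buffers now belong to newpage: drop the buffer locks and the
	 * extra references taken above.
	 */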
	bh = head;
	do {
		unlock_buffer(bh);
		put_bh(bh);
		bh = bh->b_this_page;

	} while (bh != head);

	return 0;
}
EXPORT_SYMBOL(buffer_migrate_page);
#endif

/*
 * Write back a page to clean its dirty state
 */
static int writeout(struct address_space *mapping, struct page *page)
{
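	/*
	 * Use a minimal writeback_control: write only this one page,
	 * without blocking, much as page reclaim would.
	 */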
	struct writeback_control wbc = {
		.sync_mode = WB_SYNC_NONE,
		.nr_to_write = 1,
		.range_start = 0,
		.range_end = LLONG_MAX,
		.nonblocking = 1,
		.for_reclaim = 1
	};
	int rc;

	if (!mapping->a_ops->writepage)
		/* No write method for the address space */
		return -EINVAL;

	if (!clear_page_dirty_for_io(page))
		/* Someone else already triggered a write */
		return -EAGAIN;

	/*
	 * A dirty page may imply that the underlying filesystem has
	 * the page on some queue. So the page must be clean for
	 * migration. Writeout may mean we lose the lock and the
	 * page state is no longer what we checked for earlier.
	 * At this point we know that the migration attempt cannot
	 * be successful.
	 */
	remove_migration_ptes(page, page);

	rc = mapping->a_ops->writepage(page, &wbc);
	if (rc < 0)
		/* I/O Error writing */
		return -EIO;

	if (rc != AOP_WRITEPAGE_ACTIVATE)
		/* unlocked. Relock */
		lock_page(page);

	return -EAGAIN;
}

/*
 * Default handling if a filesystem does not provide a migration function.
 */
static int fallback_migrate_page(struct address_space *mapping,
	struct page *newpage, struct page *page)
{
	if (PageDirty(page))
		return writeout(mapping, page);

	/*
	 * Buffers may be managed in a filesystem specific way.
	 * We must have no buffers or drop them.
	 */
	if (PagePrivate(page) &&
	    !try_to_release_page(page, GFP_KERNEL))
		return -EAGAIN;

	return migrate_page(mapping, newpage, page);
}

/*
 * Move a page to a newly allocated page
 * The page is locked and all ptes have been successfully removed.
 *
 * The new page will have replaced the old page if this function
 * is successful.
 *
 * Return value:
 *   < 0 - error code
 *  == 0 - success
 */
static int move_to_new_page(struct page *newpage, struct page *page)
{
	struct address_space *mapping;
	int rc;

	/*
	 * Block others from accessing the page when we get around to
	 * establishing additional references. We are the only one
	 * holding a reference to the new page at this point.
	 */
	if (!trylock_page(newpage))
		BUG();

	/* Prepare mapping for the new page. */
	newpage->index = page->index;
	newpage->mapping = page->mapping;
	if (PageSwapBacked(page))
		SetPageSwapBacked(newpage);

	mapping = page_mapping(page);
	if (!mapping)
		rc = migrate_page(mapping, newpage, page);
	else if (mapping->a_ops->migratepage)
		/*
		 * Most pages have a mapping and most filesystems
		 * should provide a migration function. Anonymous
		 * pages are part of swap space which also has its
		 * own migration function. This is the most common
		 * path for page migration.
		 */
		rc = mapping->a_ops->migratepage(mapping,
						newpage, page);
	else
		rc = fallback_migrate_page(mapping, newpage, page);

	if (!rc) {
		remove_migration_ptes(page, newpage);
	} else
		newpage->mapping = NULL;

	unlock_page(newpage);

	return rc;
}

/*
 * Obtain the lock on page, remove all ptes and migrate the page
 * to the newly allocated page in newpage.
 */
static int unmap_and_move(new_page_t get_new_page, unsigned long private,
			struct page *page, int force)
{
	int rc = 0;
	int *result = NULL;
	struct page *newpage = get_new_page(page, private, &result);
	int rcu_locked = 0;
	int charge = 0;

	if (!newpage)
		return -ENOMEM;

	if (page_count(page) == 1) {
		/* page was freed from under us. So we are done. */
		goto move_newpage;
	}

	charge = mem_cgroup_prepare_migration(page, newpage);
	if (charge == -ENOMEM) {
		rc = -ENOMEM;
		goto move_newpage;
	}
	/* prepare cgroup just returns 0 or -ENOMEM */
	BUG_ON(charge);

	rc = -EAGAIN;
	if (!trylock_page(page)) {
		if (!force)
			goto move_newpage;
		lock_page(page);
	}

	if (PageWriteback(page)) {
		if (!force)
			goto unlock;
		wait_on_page_writeback(page);
	}
	/*
	 * try_to_unmap() drops page->mapcount to 0 here, so we would not
	 * otherwise notice the anon_vma being freed while we migrate the
	 * page.  This rcu_read_lock() delays freeing of the anon_vma
	 * pointer until the end of migration.  File cache pages are not a
	 * problem because they are protected by the page lock during
	 * migration, so only anonymous pages need this care.
	 */
	if (PageAnon(page)) {
		rcu_read_lock();
		rcu_locked = 1;
	}

	/*
	 * Corner case handling:
	 * 1. When a new swap-cache page is read in, it is added to the LRU
	 * and treated as swapcache but it has no rmap yet.
	 * Calling try_to_unmap() against a page->mapping==NULL page will
	 * trigger a BUG.  So handle it here.
	 * 2. An orphaned page (see truncate_complete_page) might have
	 * fs-private metadata. The page can be picked up due to memory
	 * offlining.  Everywhere else except page reclaim, the page is
	 * invisible to the VM, so the page cannot be migrated.  So try to
	 * free the metadata, so the page can be freed.
	 */
	if (!page->mapping) {
		if (!PageAnon(page) && PagePrivate(page)) {
			/*
			 * Go direct to try_to_free_buffers() here because
			 * a) that's what try_to_release_page() would do anyway
			 * b) we may be under rcu_read_lock() here, so we can't
			 *    use GFP_KERNEL which is what try_to_release_page()
			 *    needs to be effective.
			 */
			try_to_free_buffers(page);
		}
		goto rcu_unlock;
	}

	/* Establish migration ptes or remove ptes */
	try_to_unmap(page, 1);

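	/*
	 * If all users are gone, move the contents over to the new page.
	 * On failure, restore the migration ptes so they point back at
	 * the original page.
	 */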
	if (!page_mapped(page))
		rc = move_to_new_page(newpage, page);

	if (rc)
		remove_migration_ptes(page, page);
rcu_unlock:
	if (rcu_locked)
		rcu_read_unlock();

unlock:
	unlock_page(page);

	if (rc != -EAGAIN) {
		/*
		 * A page that has been migrated has all references
		 * removed and will be freed. A page that has not been
		 * migrated will have kept its references and be
		 * restored.
		 */
		list_del(&page->lru);
		putback_lru_page(page);
	}

move_newpage:
	if (!charge)
		mem_cgroup_end_migration(newpage);

	/*
	 * Move the new page to the LRU. If migration was not successful
	 * then this will free the page.
	 */
	putback_lru_page(newpage);

	if (result) {
		if (rc)
			*result = rc;
		else
			*result = page_to_nid(newpage);
	}
	return rc;
}

/*
 * migrate_pages
 *
 * The function takes one list of pages to migrate and a callback
 * that, given the page to be migrated and the private data,
 * determines the target of the move and allocates the new page.
 *
 * The function returns after 10 attempts, or when no pages are
 * movable anymore because the list has become empty or no
 * retryable pages remain. All pages will be returned to the LRU
 * or freed.
 *
 * Return: Number of pages not migrated, or an error code.
 */
int migrate_pages(struct list_head *from,
		new_page_t get_new_page, unsigned long private)
{
	int retry = 1;
	int nr_failed = 0;
	int pass = 0;
	struct page *page;
	struct page *page2;
	int swapwrite = current->flags & PF_SWAPWRITE;
	int rc;

	if (!swapwrite)
		current->flags |= PF_SWAPWRITE;

	for (pass = 0; pass < 10 && retry; pass++) {
		retry = 0;

		list_for_each_entry_safe(page, page2, from, lru) {
			cond_resched();

			rc = unmap_and_move(get_new_page, private,
						page, pass > 2);

			switch (rc) {
			case -ENOMEM:
				goto out;
			case -EAGAIN:
				retry++;
				break;
			case 0:
				break;
			default:
				/* Permanent failure */
				nr_failed++;
				break;
			}
		}
	}
	rc = 0;
out:
	if (!swapwrite)
		current->flags &= ~PF_SWAPWRITE;

	putback_lru_pages(from);

	if (rc)
		return rc;

	return nr_failed + retry;
}

#ifdef CONFIG_NUMA
/*
 * Move a list of individual pages
 */
struct page_to_node {
	unsigned long addr;
	struct page *page;
	int node;
	int status;
};

static struct page *new_page_node(struct page *p, unsigned long private,
		int **result)
{
	struct page_to_node *pm = (struct page_to_node *)private;

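	/*
	 * Scan the pm array, which is terminated by a MAX_NUMNODES entry,
	 * for the element describing this page.
	 */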
	while (pm->node != MAX_NUMNODES && pm->page != p)
		pm++;

	if (pm->node == MAX_NUMNODES)
		return NULL;

	*result = &pm->status;

	return alloc_pages_node(pm->node,
				GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0);
}

/*
 * Move a set of pages as indicated in the pm array. The addr
 * field must be set to the virtual address of the page to be moved
 * and the node number must contain a valid target node.
 */
static int do_move_pages(struct mm_struct *mm, struct page_to_node *pm,
				int migrate_all)
{
	int err;
	struct page_to_node *pp;
	LIST_HEAD(pagelist);

	down_read(&mm->mmap_sem);

	/*
	 * Build a list of pages to migrate
	 */
	migrate_prep();
	for (pp = pm; pp->node != MAX_NUMNODES; pp++) {
		struct vm_area_struct *vma;
		struct page *page;

		/*
		 * A valid page pointer that will not match any of the
		 * pages that will be moved.
		 */
		pp->page = ZERO_PAGE(0);

		err = -EFAULT;
		vma = find_vma(mm, pp->addr);
		if (!vma || !vma_migratable(vma))
			goto set_status;

		page = follow_page(vma, pp->addr, FOLL_GET);

		err = PTR_ERR(page);
		if (IS_ERR(page))
			goto set_status;

		err = -ENOENT;
		if (!page)
			goto set_status;

		if (PageReserved(page))		/* Check for zero page */
			goto put_and_set;

		pp->page = page;
		err = page_to_nid(page);

		if (err == pp->node)
			/*
			 * Node already in the right place
			 */
			goto put_and_set;

		err = -EACCES;
		if (page_mapcount(page) > 1 &&
				!migrate_all)
			goto put_and_set;

		err = isolate_lru_page(page);
		if (!err)
			list_add_tail(&page->lru, &pagelist);
put_and_set:
		/*
		 * Either remove the duplicate refcount from
		 * isolate_lru_page() or drop the page ref if it was
		 * not isolated.
		 */
		put_page(page);
set_status:
		pp->status = err;
	}

	err = 0;
	if (!list_empty(&pagelist))
		err = migrate_pages(&pagelist, new_page_node,
				(unsigned long)pm);

	up_read(&mm->mmap_sem);
	return err;
}

/*
 * Determine the nodes of a list of pages. The addr in the pm array
 * must have been set to the virtual address whose node number we
 * want to determine.
 */
static int do_pages_stat(struct mm_struct *mm, struct page_to_node *pm)
{
	down_read(&mm->mmap_sem);

	for ( ; pm->node != MAX_NUMNODES; pm++) {
		struct vm_area_struct *vma;
		struct page *page;
		int err;

		err = -EFAULT;
		vma = find_vma(mm, pm->addr);
		if (!vma)
			goto set_status;

		page = follow_page(vma, pm->addr, 0);

		err = PTR_ERR(page);
		if (IS_ERR(page))
			goto set_status;

		err = -ENOENT;
		/* Use PageReserved to check for zero page */
		if (!page || PageReserved(page))
			goto set_status;

		err = page_to_nid(page);
set_status:
		pm->status = err;
	}

	up_read(&mm->mmap_sem);
	return 0;
}

/*
 * Move a list of pages in the address space of the currently executing
 * process.
 */
asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages,
			const void __user * __user *pages,
			const int __user *nodes,
			int __user *status, int flags)
{
	int err = 0;
	int i;
	struct task_struct *task;
	nodemask_t task_nodes;
	struct mm_struct *mm;
	struct page_to_node *pm = NULL;

	/* Check flags */
	if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
		return -EINVAL;

	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
		return -EPERM;

	/* Find the mm_struct */
	read_lock(&tasklist_lock);
	task = pid ? find_task_by_vpid(pid) : current;
	if (!task) {
		read_unlock(&tasklist_lock);
		return -ESRCH;
	}
	mm = get_task_mm(task);
	read_unlock(&tasklist_lock);

	if (!mm)
		return -EINVAL;

	/*
	 * Check if this process has the right to modify the specified
	 * process. The right exists if the process has administrative
	 * capabilities, superuser privileges or the same
	 * userid as the target process.
	 */
	if ((current->euid != task->suid) && (current->euid != task->uid) &&
	    (current->uid != task->suid) && (current->uid != task->uid) &&
	    !capable(CAP_SYS_NICE)) {
		err = -EPERM;
		goto out2;
	}

	err = security_task_movememory(task);
	if (err)
		goto out2;

	task_nodes = cpuset_mems_allowed(task);

	/* Limit nr_pages so that the multiplication may not overflow */
	if (nr_pages >= ULONG_MAX / sizeof(struct page_to_node) - 1) {
		err = -E2BIG;
		goto out2;
	}

	pm = vmalloc((nr_pages + 1) * sizeof(struct page_to_node));
	if (!pm) {
		err = -ENOMEM;
		goto out2;
	}

	/*
	 * Get parameters from user space and initialize the pm
	 * array. Return various errors if the user did something wrong.
	 */
	for (i = 0; i < nr_pages; i++) {
		const void __user *p;

		err = -EFAULT;
		if (get_user(p, pages + i))
			goto out;

		pm[i].addr = (unsigned long)p;
		if (nodes) {
			int node;

			if (get_user(node, nodes + i))
				goto out;

			err = -ENODEV;
			if (!node_state(node, N_HIGH_MEMORY))
				goto out;

			err = -EACCES;
			if (!node_isset(node, task_nodes))
				goto out;

			pm[i].node = node;
		} else
			pm[i].node = 0;	/* anything to not match MAX_NUMNODES */
	}
	/* End marker */
	pm[nr_pages].node = MAX_NUMNODES;

	if (nodes)
		err = do_move_pages(mm, pm, flags & MPOL_MF_MOVE_ALL);
	else
		err = do_pages_stat(mm, pm);

	if (err >= 0)
		/* Return status information */
		for (i = 0; i < nr_pages; i++)
			if (put_user(pm[i].status, status + i))
				err = -EFAULT;

out:
	vfree(pm);
out2:
	mmput(mm);
	return err;
}

/*
 * Call the migration functions in the vma_ops that may prepare
 * memory in a vm for migration. The migration functions may perform
 * the migration for vmas that do not have an underlying page struct.
 */
int migrate_vmas(struct mm_struct *mm, const nodemask_t *to,
	const nodemask_t *from, unsigned long flags)
{
	struct vm_area_struct *vma;
	int err = 0;

	for (vma = mm->mmap; vma->vm_next && !err; vma = vma->vm_next) {
		if (vma->vm_ops && vma->vm_ops->migrate) {
			err = vma->vm_ops->migrate(vma, to, from, flags);
			if (err)
				break;
		}
	}
	return err;
}
#endif