migrate.c revision 7a81b88cb53e335ff7d019e6398c95792c817d93
/*
 * Memory Migration functionality - linux/mm/migrate.c
 *
 * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
 *
 * Page migration was first developed in the context of the memory hotplug
 * project. The main authors of the migration code are:
 *
 * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
 * Hirokazu Takahashi <taka@valinux.co.jp>
 * Dave Hansen <haveblue@us.ibm.com>
 * Christoph Lameter
 */

#include <linux/migrate.h>
#include <linux/module.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/pagemap.h>
#include <linux/buffer_head.h>
#include <linux/mm_inline.h>
#include <linux/nsproxy.h>
#include <linux/pagevec.h>
#include <linux/rmap.h>
#include <linux/topology.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/writeback.h>
#include <linux/mempolicy.h>
#include <linux/vmalloc.h>
#include <linux/security.h>
#include <linux/memcontrol.h>
#include <linux/syscalls.h>

#include "internal.h"

#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))

/*
 * migrate_prep() needs to be called before we start compiling a list of pages
 * to be migrated using isolate_lru_page().
 */
int migrate_prep(void)
{
	/*
	 * Clear the LRU lists so pages can be isolated.
	 * Note that pages may be moved off the LRU after we have
	 * drained them. Those pages will fail to migrate like other
	 * pages that may be busy.
	 */
	lru_add_drain_all();

	return 0;
}
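
/*
 * Illustrative usage sketch (not part of the original source): a typical
 * migration caller drains the per-cpu LRU pagevecs with migrate_prep(),
 * isolates the pages it wants to move onto a private list, and then hands
 * that list to migrate_pages() further down together with an allocation
 * callback.  The helper names below are hypothetical.
 */
#if 0
static int example_migrate_list(struct list_head *pagelist,
				new_page_t alloc_target, unsigned long private)
{
	migrate_prep();		/* empty the per-cpu LRU pagevecs */
	/* ... isolate_lru_page() each target and list_add_tail() it ... */
	return migrate_pages(pagelist, alloc_target, private);
	/* migrate_pages() puts back any page that could not be moved */
}
#endif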

/*
 * Add isolated pages on the list back to the LRU under page lock
 * to avoid leaking evictable pages back onto the unevictable list.
 *
 * Returns the number of pages put back.
 */
int putback_lru_pages(struct list_head *l)
{
	struct page *page;
	struct page *page2;
	int count = 0;

	list_for_each_entry_safe(page, page2, l, lru) {
		list_del(&page->lru);
		putback_lru_page(page);
		count++;
	}
	return count;
}

/*
 * Restore a potential migration pte to a working pte entry
 */
static void remove_migration_pte(struct vm_area_struct *vma,
		struct page *old, struct page *new)
{
	struct mm_struct *mm = vma->vm_mm;
	swp_entry_t entry;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *ptep, pte;
	spinlock_t *ptl;
	unsigned long addr = page_address_in_vma(new, vma);

	if (addr == -EFAULT)
		return;

	pgd = pgd_offset(mm, addr);
	if (!pgd_present(*pgd))
		return;

	pud = pud_offset(pgd, addr);
	if (!pud_present(*pud))
		return;

	pmd = pmd_offset(pud, addr);
	if (!pmd_present(*pmd))
		return;

	ptep = pte_offset_map(pmd, addr);

	if (!is_swap_pte(*ptep)) {
		pte_unmap(ptep);
		return;
	}

	ptl = pte_lockptr(mm, pmd);
	spin_lock(ptl);
	pte = *ptep;
	if (!is_swap_pte(pte))
		goto out;

	entry = pte_to_swp_entry(pte);

	if (!is_migration_entry(entry) || migration_entry_to_page(entry) != old)
		goto out;

	/*
	 * Yes, ignore the return value from a GFP_ATOMIC mem_cgroup_charge.
	 * Failure is not an option here: we're now expected to remove every
	 * migration pte, and will cause crashes otherwise.  Normally this
	 * is not an issue: mem_cgroup_prepare_migration bumped up the old
	 * page_cgroup count for safety, that's now attached to the new page,
	 * so this charge should just be another increment of the count,
	 * to keep in balance with rmap.c's mem_cgroup_uncharging.  But if
	 * there's been a force_empty, those reference counts may no longer
	 * be reliable, and this charge can actually fail: oh well, we don't
	 * make the situation any worse by proceeding as if it had succeeded.
	 */
	mem_cgroup_charge_migrate_fixup(new, mm, GFP_ATOMIC);

	get_page(new);
	pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
	if (is_write_migration_entry(entry))
		pte = pte_mkwrite(pte);
	flush_cache_page(vma, addr, pte_pfn(pte));
	set_pte_at(mm, addr, ptep, pte);

	if (PageAnon(new))
		page_add_anon_rmap(new, vma, addr);
	else
		page_add_file_rmap(new);

	/* No need to invalidate - it was non-present before */
	update_mmu_cache(vma, addr, pte);

out:
	pte_unmap_unlock(ptep, ptl);
}

/*
 * Note that remove_file_migration_ptes will only work on regular mappings;
 * nonlinear mappings do not use migration entries.
 */
static void remove_file_migration_ptes(struct page *old, struct page *new)
{
	struct vm_area_struct *vma;
	struct address_space *mapping = page_mapping(new);
	struct prio_tree_iter iter;
	pgoff_t pgoff = new->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);

	if (!mapping)
		return;

	spin_lock(&mapping->i_mmap_lock);

	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff)
		remove_migration_pte(vma, old, new);

	spin_unlock(&mapping->i_mmap_lock);
}

/*
 * Must hold the mmap_sem lock on at least one of the vmas containing
 * the page so that the anon_vma cannot vanish.
 */
static void remove_anon_migration_ptes(struct page *old, struct page *new)
{
	struct anon_vma *anon_vma;
	struct vm_area_struct *vma;
	unsigned long mapping;

	mapping = (unsigned long)new->mapping;

	if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0)
		return;

	/*
	 * We hold the mmap_sem lock, so there is no need to call
	 * page_lock_anon_vma.
	 */
	anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON);
	spin_lock(&anon_vma->lock);

	list_for_each_entry(vma, &anon_vma->head, anon_vma_node)
		remove_migration_pte(vma, old, new);

	spin_unlock(&anon_vma->lock);
}

/*
 * Get rid of all migration entries and replace them by
 * references to the indicated page.
 */
static void remove_migration_ptes(struct page *old, struct page *new)
{
	if (PageAnon(new))
		remove_anon_migration_ptes(old, new);
	else
		remove_file_migration_ptes(old, new);
}

/*
 * Something used the pte of a page under migration. We need to
 * get to the page and wait until migration is finished.
 * When we return from this function the fault will be retried.
 *
 * This function is called from do_swap_page().
 */
void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
				unsigned long address)
{
	pte_t *ptep, pte;
	spinlock_t *ptl;
	swp_entry_t entry;
	struct page *page;

	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
	pte = *ptep;
	if (!is_swap_pte(pte))
		goto out;

	entry = pte_to_swp_entry(pte);
	if (!is_migration_entry(entry))
		goto out;

	page = migration_entry_to_page(entry);

	/*
	 * Once the radix-tree replacement of page migration has started,
	 * page_count *must* be zero. And we don't want to call
	 * wait_on_page_locked() against a page without get_page().
	 * So we use get_page_unless_zero() here. Even if it fails, the
	 * page fault will simply occur again.
	 */
	if (!get_page_unless_zero(page))
		goto out;
	pte_unmap_unlock(ptep, ptl);
	wait_on_page_locked(page);
	put_page(page);
	return;
out:
	pte_unmap_unlock(ptep, ptl);
}
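
/*
 * Fault-side pattern (condensed, illustrative - not the verbatim
 * do_swap_page() code): a migration entry looks like an ordinary swap pte,
 * so the fault handler converts the pte first and only then checks for the
 * migration type before sleeping in migration_entry_wait().
 */
#if 0
	entry = pte_to_swp_entry(orig_pte);
	if (is_migration_entry(entry)) {
		migration_entry_wait(mm, pmd, address);
		goto out;	/* retry the fault once migration finishes */
	}
#endif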

/*
 * Replace the page in the mapping.
 *
 * The number of remaining references must be:
 * 1 for anonymous pages without a mapping
 * 2 for pages with a mapping
 * 3 for pages with a mapping and PagePrivate set.
 */
static int migrate_page_move_mapping(struct address_space *mapping,
		struct page *newpage, struct page *page)
{
	int expected_count;
	void **pslot;

	if (!mapping) {
		/* Anonymous page without mapping */
		if (page_count(page) != 1)
			return -EAGAIN;
		return 0;
	}

	spin_lock_irq(&mapping->tree_lock);

	pslot = radix_tree_lookup_slot(&mapping->page_tree,
					page_index(page));

	expected_count = 2 + !!PagePrivate(page);
	if (page_count(page) != expected_count ||
			(struct page *)radix_tree_deref_slot(pslot) != page) {
		spin_unlock_irq(&mapping->tree_lock);
		return -EAGAIN;
	}

	if (!page_freeze_refs(page, expected_count)) {
		spin_unlock_irq(&mapping->tree_lock);
		return -EAGAIN;
	}

	/*
	 * Now we know that no one else is looking at the page.
	 */
	get_page(newpage);	/* add cache reference */
	if (PageSwapCache(page)) {
		SetPageSwapCache(newpage);
		set_page_private(newpage, page_private(page));
	}

	radix_tree_replace_slot(pslot, newpage);

	page_unfreeze_refs(page, expected_count);
	/*
	 * Drop cache reference from old page.
	 * We know this isn't the last reference.
	 */
	__put_page(page);

	/*
	 * If moved to a different zone then also account
	 * the page for that zone. Other VM counters will be
	 * taken care of when we establish references to the
	 * new page and drop references to the old page.
	 *
	 * Note that anonymous pages are accounted for
	 * via NR_FILE_PAGES and NR_ANON_PAGES if they
	 * are mapped to swap space.
	 */
	__dec_zone_page_state(page, NR_FILE_PAGES);
	__inc_zone_page_state(newpage, NR_FILE_PAGES);

	spin_unlock_irq(&mapping->tree_lock);

	return 0;
}
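
/*
 * Worked example for the expected_count check above (illustrative): an
 * isolated pagecache page with buffers is pinned by its radix-tree slot (1)
 * and by the migration caller that isolated it (1), plus one reference for
 * its private buffer heads, so expected_count = 2 + !!PagePrivate(page) = 3.
 * Any extra reference means someone else is still using the page and the
 * move is retried with -EAGAIN.
 */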

/*
 * Copy the page to its new location
 */
static void migrate_page_copy(struct page *newpage, struct page *page)
{
	int anon;

	copy_highpage(newpage, page);

	if (PageError(page))
		SetPageError(newpage);
	if (PageReferenced(page))
		SetPageReferenced(newpage);
	if (PageUptodate(page))
		SetPageUptodate(newpage);
	if (TestClearPageActive(page)) {
		VM_BUG_ON(PageUnevictable(page));
		SetPageActive(newpage);
	} else
		unevictable_migrate_page(newpage, page);
	if (PageChecked(page))
		SetPageChecked(newpage);
	if (PageMappedToDisk(page))
		SetPageMappedToDisk(newpage);

	if (PageDirty(page)) {
		clear_page_dirty_for_io(page);
		/*
		 * Want to mark the page and the radix tree as dirty, and
		 * redo the accounting that clear_page_dirty_for_io undid,
		 * but we can't use set_page_dirty because that function
		 * is actually a signal that all of the page has become dirty,
		 * whereas only part of our page may be dirty.
		 */
		__set_page_dirty_nobuffers(newpage);
	}

	mlock_migrate_page(newpage, page);

	ClearPageSwapCache(page);
	ClearPagePrivate(page);
	set_page_private(page, 0);
	/* page->mapping contains a flag for PageAnon() */
	anon = PageAnon(page);
	page->mapping = NULL;

	if (!anon) /* This page was removed from radix-tree. */
		mem_cgroup_uncharge_cache_page(page);

	/*
	 * If any waiters have accumulated on the new page then
	 * wake them up.
	 */
	if (PageWriteback(newpage))
		end_page_writeback(newpage);
}

/************************************************************
 *                    Migration functions
 ***********************************************************/

/* Always fail migration. Used for mappings that are not movable */
int fail_migrate_page(struct address_space *mapping,
			struct page *newpage, struct page *page)
{
	return -EIO;
}
EXPORT_SYMBOL(fail_migrate_page);

/*
 * Common logic to directly migrate a single page suitable for
 * pages that do not use PagePrivate.
 *
 * Pages are locked upon entry and exit.
 */
int migrate_page(struct address_space *mapping,
		struct page *newpage, struct page *page)
{
	int rc;

	BUG_ON(PageWriteback(page));	/* Writeback must be complete */

	rc = migrate_page_move_mapping(mapping, newpage, page);

	if (rc)
		return rc;

	migrate_page_copy(newpage, page);
	return 0;
}
EXPORT_SYMBOL(migrate_page);

#ifdef CONFIG_BLOCK
/*
 * Migration function for pages with buffers. This function can only be used
 * if the underlying filesystem guarantees that no other references to "page"
 * exist.
 */
int buffer_migrate_page(struct address_space *mapping,
		struct page *newpage, struct page *page)
{
	struct buffer_head *bh, *head;
	int rc;

	if (!page_has_buffers(page))
		return migrate_page(mapping, newpage, page);

	head = page_buffers(page);

	rc = migrate_page_move_mapping(mapping, newpage, page);

	if (rc)
		return rc;

	bh = head;
	do {
		get_bh(bh);
		lock_buffer(bh);
		bh = bh->b_this_page;

	} while (bh != head);

	ClearPagePrivate(page);
	set_page_private(newpage, page_private(page));
	set_page_private(page, 0);
	put_page(page);
	get_page(newpage);

	bh = head;
	do {
		set_bh_page(bh, newpage, bh_offset(bh));
		bh = bh->b_this_page;

	} while (bh != head);

	SetPagePrivate(newpage);

	migrate_page_copy(newpage, page);

	bh = head;
	do {
		unlock_buffer(bh);
		put_bh(bh);
		bh = bh->b_this_page;

	} while (bh != head);

	return 0;
}
EXPORT_SYMBOL(buffer_migrate_page);
#endif
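
/*
 * Summary of buffer_migrate_page() above: the buffer ring is walked three
 * times - once to grab and lock every buffer_head, once (after the
 * radix-tree slot has been moved) to repoint each buffer_head at the new
 * page with set_bh_page(), and once more, after the page contents and flags
 * have been copied, to unlock and release the buffers again.
 */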

/*
 * Write back a page to clean the dirty state
 */
static int writeout(struct address_space *mapping, struct page *page)
{
	struct writeback_control wbc = {
		.sync_mode = WB_SYNC_NONE,
		.nr_to_write = 1,
		.range_start = 0,
		.range_end = LLONG_MAX,
		.nonblocking = 1,
		.for_reclaim = 1
	};
	int rc;

	if (!mapping->a_ops->writepage)
		/* No write method for the address space */
		return -EINVAL;

	if (!clear_page_dirty_for_io(page))
		/* Someone else already triggered a write */
		return -EAGAIN;

	/*
	 * A dirty page may imply that the underlying filesystem has
	 * the page on some queue. So the page must be clean for
	 * migration. Writeout may mean we lose the lock and the
	 * page state is no longer what we checked for earlier.
	 * At this point we know that the migration attempt cannot
	 * be successful.
	 */
	remove_migration_ptes(page, page);

	rc = mapping->a_ops->writepage(page, &wbc);

	if (rc != AOP_WRITEPAGE_ACTIVATE)
		/* unlocked. Relock */
		lock_page(page);

	return (rc < 0) ? -EIO : -EAGAIN;
}

/*
 * Default handling if a filesystem does not provide a migration function.
 */
static int fallback_migrate_page(struct address_space *mapping,
	struct page *newpage, struct page *page)
{
	if (PageDirty(page))
		return writeout(mapping, page);

	/*
	 * Buffers may be managed in a filesystem-specific way.
	 * We must have no buffers or drop them.
	 */
	if (PagePrivate(page) &&
	    !try_to_release_page(page, GFP_KERNEL))
		return -EAGAIN;

	return migrate_page(mapping, newpage, page);
}

/*
 * Move a page to a newly allocated page.
 * The page is locked and all ptes have been successfully removed.
 *
 * The new page will have replaced the old page if this function
 * is successful.
 *
 * Return value:
 *   < 0 - error code
 *  == 0 - success
 */
static int move_to_new_page(struct page *newpage, struct page *page)
{
	struct address_space *mapping;
	int rc;

	/*
	 * Block others from accessing the page when we get around to
	 * establishing additional references. We are the only one
	 * holding a reference to the new page at this point.
	 */
	if (!trylock_page(newpage))
		BUG();

	/* Prepare mapping for the new page. */
	newpage->index = page->index;
	newpage->mapping = page->mapping;
	if (PageSwapBacked(page))
		SetPageSwapBacked(newpage);

	mapping = page_mapping(page);
	if (!mapping)
		rc = migrate_page(mapping, newpage, page);
	else if (mapping->a_ops->migratepage)
		/*
		 * Most pages have a mapping and most filesystems
		 * should provide a migration function. Anonymous
		 * pages are part of swap space which also has its
		 * own migration function. This is the most common
		 * path for page migration.
		 */
		rc = mapping->a_ops->migratepage(mapping,
						newpage, page);
	else
		rc = fallback_migrate_page(mapping, newpage, page);

	if (!rc) {
		remove_migration_ptes(page, newpage);
	} else
		newpage->mapping = NULL;

	unlock_page(newpage);

	return rc;
}
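
/*
 * Dispatch summary for move_to_new_page() above: anonymous pages that are
 * not in swap cache have no mapping and go through migrate_page() directly;
 * pages with a mapping use the filesystem's (or swap's) ->migratepage()
 * method when one is provided; everything else falls back to
 * fallback_migrate_page(), which may have to write the page out first.
 */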

/*
 * Obtain the lock on the page, remove all ptes and migrate the page
 * to the newly allocated page in newpage.
 */
static int unmap_and_move(new_page_t get_new_page, unsigned long private,
			struct page *page, int force)
{
	int rc = 0;
	int *result = NULL;
	struct page *newpage = get_new_page(page, private, &result);
	int rcu_locked = 0;
	int charge = 0;

	if (!newpage)
		return -ENOMEM;

	if (page_count(page) == 1) {
		/* page was freed from under us. So we are done. */
		goto move_newpage;
	}

	charge = mem_cgroup_prepare_migration(page, newpage);
	if (charge == -ENOMEM) {
		rc = -ENOMEM;
		goto move_newpage;
	}
	/* prepare cgroup just returns 0 or -ENOMEM */
	BUG_ON(charge);

	rc = -EAGAIN;
	if (!trylock_page(page)) {
		if (!force)
			goto move_newpage;
		lock_page(page);
	}

	if (PageWriteback(page)) {
		if (!force)
			goto unlock;
		wait_on_page_writeback(page);
	}
	/*
	 * try_to_unmap() drops page->mapcount to 0 here, so we would not
	 * otherwise notice if the anon_vma were freed while we migrate the
	 * page. This rcu_read_lock() delays freeing of the anon_vma pointer
	 * until the end of migration. File cache pages are no problem
	 * because they are protected by the page lock (and may use
	 * writepage() during migration), so only anonymous pages need
	 * this care.
	 */
	if (PageAnon(page)) {
		rcu_read_lock();
		rcu_locked = 1;
	}

	/*
	 * Corner case handling:
	 * 1. When a new swap-cache page is read in, it is added to the LRU
	 * and treated as swapcache but it has no rmap yet.
	 * Calling try_to_unmap() against a page->mapping==NULL page will
	 * trigger a BUG.  So handle it here.
	 * 2. An orphaned page (see truncate_complete_page) might have
	 * fs-private metadata. The page can be picked up due to memory
	 * offlining.  Everywhere else except page reclaim, the page is
	 * invisible to the vm, so the page can not be migrated.  So try to
	 * free the metadata, so the page can be freed.
	 */
	if (!page->mapping) {
		if (!PageAnon(page) && PagePrivate(page)) {
			/*
			 * Go direct to try_to_free_buffers() here because
			 * a) that's what try_to_release_page() would do anyway
			 * b) we may be under rcu_read_lock() here, so we can't
			 *    use GFP_KERNEL which is what try_to_release_page()
			 *    needs to be effective.
			 */
			try_to_free_buffers(page);
		}
		goto rcu_unlock;
	}

	/* Establish migration ptes or remove ptes */
	try_to_unmap(page, 1);

	if (!page_mapped(page))
		rc = move_to_new_page(newpage, page);

	if (rc)
		remove_migration_ptes(page, page);
rcu_unlock:
	if (rcu_locked)
		rcu_read_unlock();

unlock:
	unlock_page(page);

	if (rc != -EAGAIN) {
		/*
		 * A page that has been migrated has all references
		 * removed and will be freed. A page that has not been
		 * migrated will have kept its references and will be
		 * restored.
		 */
		list_del(&page->lru);
		putback_lru_page(page);
	}

move_newpage:
	if (!charge)
		mem_cgroup_end_migration(newpage);

	/*
	 * Move the new page to the LRU. If migration was not successful
	 * then this will free the page.
	 */
	putback_lru_page(newpage);

	if (result) {
		if (rc)
			*result = rc;
		else
			*result = page_to_nid(newpage);
	}
	return rc;
}

/*
 * migrate_pages
 *
 * The function takes one list of pages to migrate and a callback
 * that, given a page to be migrated and the private data, determines
 * the target of the move and allocates the new page.
 *
 * The function returns after 10 attempts or if no pages
 * are movable any more because the list has become empty
 * or no retryable pages exist any more. All pages will be
 * returned to the LRU or freed.
 *
 * Return: Number of pages not migrated or error code.
 */
int migrate_pages(struct list_head *from,
		new_page_t get_new_page, unsigned long private)
{
	int retry = 1;
	int nr_failed = 0;
	int pass = 0;
	struct page *page;
	struct page *page2;
	int swapwrite = current->flags & PF_SWAPWRITE;
	int rc;

	if (!swapwrite)
		current->flags |= PF_SWAPWRITE;

	for (pass = 0; pass < 10 && retry; pass++) {
		retry = 0;

		list_for_each_entry_safe(page, page2, from, lru) {
			cond_resched();

			rc = unmap_and_move(get_new_page, private,
						page, pass > 2);

			switch (rc) {
			case -ENOMEM:
				goto out;
			case -EAGAIN:
				retry++;
				break;
			case 0:
				break;
			default:
				/* Permanent failure */
				nr_failed++;
				break;
			}
		}
	}
	rc = 0;
out:
	if (!swapwrite)
		current->flags &= ~PF_SWAPWRITE;

	putback_lru_pages(from);

	if (rc)
		return rc;

	return nr_failed + retry;
}
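
/*
 * Illustrative get_new_page callback (sketch only - new_page_node() below
 * is the real in-tree example used by sys_move_pages): the callback gets
 * the page being migrated plus the opaque private value and must return a
 * freshly allocated target page, or NULL on failure.
 */
#if 0
static struct page *alloc_on_node0(struct page *page, unsigned long private,
				   int **result)
{
	return alloc_pages_node(0, GFP_HIGHUSER_MOVABLE, 0);
}
#endif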

#ifdef CONFIG_NUMA
/*
 * Move a list of individual pages
 */
struct page_to_node {
	unsigned long addr;
	struct page *page;
	int node;
	int status;
};

static struct page *new_page_node(struct page *p, unsigned long private,
		int **result)
{
	struct page_to_node *pm = (struct page_to_node *)private;

	while (pm->node != MAX_NUMNODES && pm->page != p)
		pm++;

	if (pm->node == MAX_NUMNODES)
		return NULL;

	*result = &pm->status;

	return alloc_pages_node(pm->node,
				GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0);
}

/*
 * Move a set of pages as indicated in the pm array. The addr
 * field must be set to the virtual address of the page to be moved
 * and the node number must contain a valid target node.
 * The pm array ends with node = MAX_NUMNODES.
 */
static int do_move_page_to_node_array(struct mm_struct *mm,
				      struct page_to_node *pm,
				      int migrate_all)
{
	int err;
	struct page_to_node *pp;
	LIST_HEAD(pagelist);

	migrate_prep();
	down_read(&mm->mmap_sem);

	/*
	 * Build a list of pages to migrate
	 */
	for (pp = pm; pp->node != MAX_NUMNODES; pp++) {
		struct vm_area_struct *vma;
		struct page *page;

		err = -EFAULT;
		vma = find_vma(mm, pp->addr);
		if (!vma || !vma_migratable(vma))
			goto set_status;

		page = follow_page(vma, pp->addr, FOLL_GET);

		err = PTR_ERR(page);
		if (IS_ERR(page))
			goto set_status;

		err = -ENOENT;
		if (!page)
			goto set_status;

		if (PageReserved(page))		/* Check for zero page */
			goto put_and_set;

		pp->page = page;
		err = page_to_nid(page);

		if (err == pp->node)
			/*
			 * Page is already on the target node
			 */
			goto put_and_set;

		err = -EACCES;
		if (page_mapcount(page) > 1 &&
				!migrate_all)
			goto put_and_set;

		err = isolate_lru_page(page);
		if (!err)
			list_add_tail(&page->lru, &pagelist);
put_and_set:
		/*
		 * Either remove the duplicate refcount from
		 * isolate_lru_page() or drop the page ref if it was
		 * not isolated.
		 */
		put_page(page);
set_status:
		pp->status = err;
	}

	err = 0;
	if (!list_empty(&pagelist))
		err = migrate_pages(&pagelist, new_page_node,
				(unsigned long)pm);

	up_read(&mm->mmap_sem);
	return err;
}
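
/*
 * Layout example for the pm array handled above (illustrative): to move two
 * pages the caller fills in
 *
 *	pm[0].addr = addr0;  pm[0].node = 3;
 *	pm[1].addr = addr1;  pm[1].node = 0;
 *	pm[2].node = MAX_NUMNODES;	<- end-of-array marker
 *
 * and afterwards reads the per-page result (target node or negative errno)
 * back from pm[i].status.
 */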

/*
 * Migrate an array of page addresses onto an array of nodes and fill in
 * the corresponding array of status values.
 */
static int do_pages_move(struct mm_struct *mm, struct task_struct *task,
			 unsigned long nr_pages,
			 const void __user * __user *pages,
			 const int __user *nodes,
			 int __user *status, int flags)
{
	struct page_to_node *pm;
	nodemask_t task_nodes;
	unsigned long chunk_nr_pages;
	unsigned long chunk_start;
	int err;

	task_nodes = cpuset_mems_allowed(task);

	err = -ENOMEM;
	pm = (struct page_to_node *)__get_free_page(GFP_KERNEL);
	if (!pm)
		goto out;
	/*
	 * Store a chunk of the page_to_node array in a page,
	 * but keep the last entry free as an end marker
	 */
	chunk_nr_pages = (PAGE_SIZE / sizeof(struct page_to_node)) - 1;

	for (chunk_start = 0;
	     chunk_start < nr_pages;
	     chunk_start += chunk_nr_pages) {
		int j;

		if (chunk_start + chunk_nr_pages > nr_pages)
			chunk_nr_pages = nr_pages - chunk_start;

		/* fill the chunk pm with addrs and nodes from user-space */
		for (j = 0; j < chunk_nr_pages; j++) {
			const void __user *p;
			int node;

			err = -EFAULT;
			if (get_user(p, pages + j + chunk_start))
				goto out_pm;
			pm[j].addr = (unsigned long) p;

			if (get_user(node, nodes + j + chunk_start))
				goto out_pm;

			err = -ENODEV;
			if (!node_state(node, N_HIGH_MEMORY))
				goto out_pm;

			err = -EACCES;
			if (!node_isset(node, task_nodes))
				goto out_pm;

			pm[j].node = node;
		}

		/* End marker for this chunk */
		pm[chunk_nr_pages].node = MAX_NUMNODES;

		/* Migrate this chunk */
		err = do_move_page_to_node_array(mm, pm,
						 flags & MPOL_MF_MOVE_ALL);
		if (err < 0)
			goto out_pm;

		/* Return status information */
		for (j = 0; j < chunk_nr_pages; j++)
			if (put_user(pm[j].status, status + j + chunk_start)) {
				err = -EFAULT;
				goto out_pm;
			}
	}
	err = 0;

out_pm:
	free_page((unsigned long)pm);
out:
	return err;
}
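
/*
 * Chunk-size arithmetic used by do_pages_move() above (illustrative,
 * assuming a 64-bit build with 4 KiB pages where sizeof(struct page_to_node)
 * is 24 bytes): PAGE_SIZE / sizeof(struct page_to_node) - 1 gives roughly
 * 169 user requests per chunk, with the spare slot reserved for the
 * MAX_NUMNODES end marker that do_move_page_to_node_array() stops on.
 */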

/*
 * Determine the nodes of an array of pages and store them in an array
 * of status values.
 */
static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
				const void __user **pages, int *status)
{
	unsigned long i;

	down_read(&mm->mmap_sem);

	for (i = 0; i < nr_pages; i++) {
		unsigned long addr = (unsigned long)(*pages);
		struct vm_area_struct *vma;
		struct page *page;
		int err = -EFAULT;

		vma = find_vma(mm, addr);
		if (!vma)
			goto set_status;

		page = follow_page(vma, addr, 0);

		err = PTR_ERR(page);
		if (IS_ERR(page))
			goto set_status;

		err = -ENOENT;
		/* Use PageReserved to check for zero page */
		if (!page || PageReserved(page))
			goto set_status;

		err = page_to_nid(page);
set_status:
		*status = err;

		pages++;
		status++;
	}

	up_read(&mm->mmap_sem);
}

/*
 * Determine the nodes of a user array of pages and store them in
 * a user array of status values.
 */
static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
			 const void __user * __user *pages,
			 int __user *status)
{
#define DO_PAGES_STAT_CHUNK_NR 16
	const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR];
	int chunk_status[DO_PAGES_STAT_CHUNK_NR];
	unsigned long i, chunk_nr = DO_PAGES_STAT_CHUNK_NR;
	int err;

	for (i = 0; i < nr_pages; i += chunk_nr) {
		if (chunk_nr + i > nr_pages)
			chunk_nr = nr_pages - i;

		err = copy_from_user(chunk_pages, &pages[i],
				     chunk_nr * sizeof(*chunk_pages));
		if (err) {
			err = -EFAULT;
			goto out;
		}

		do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status);

		err = copy_to_user(&status[i], chunk_status,
				   chunk_nr * sizeof(*chunk_status));
		if (err) {
			err = -EFAULT;
			goto out;
		}
	}
	err = 0;

out:
	return err;
}

/*
 * Move a list of pages in the address space of the process identified
 * by pid (or of the currently executing process when pid is 0).
 */
asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages,
			const void __user * __user *pages,
			const int __user *nodes,
			int __user *status, int flags)
{
	const struct cred *cred = current_cred(), *tcred;
	struct task_struct *task;
	struct mm_struct *mm;
	int err;

	/* Check flags */
	if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
		return -EINVAL;

	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
		return -EPERM;

	/* Find the mm_struct */
	read_lock(&tasklist_lock);
	task = pid ? find_task_by_vpid(pid) : current;
	if (!task) {
		read_unlock(&tasklist_lock);
		return -ESRCH;
	}
	mm = get_task_mm(task);
	read_unlock(&tasklist_lock);

	if (!mm)
		return -EINVAL;

	/*
	 * Check if this process has the right to modify the specified
	 * process. The right exists if the process has administrative
	 * capabilities, superuser privileges or the same
	 * userid as the target process.
	 */
	rcu_read_lock();
	tcred = __task_cred(task);
	if (cred->euid != tcred->suid && cred->euid != tcred->uid &&
	    cred->uid  != tcred->suid && cred->uid  != tcred->uid &&
	    !capable(CAP_SYS_NICE)) {
		rcu_read_unlock();
		err = -EPERM;
		goto out;
	}
	rcu_read_unlock();

	err = security_task_movememory(task);
	if (err)
		goto out;

	if (nodes) {
		err = do_pages_move(mm, task, nr_pages, pages, nodes, status,
				    flags);
	} else {
		err = do_pages_stat(mm, nr_pages, pages, status);
	}

out:
	mmput(mm);
	return err;
}
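
/*
 * Per-page result convention of sys_move_pages() above: when "nodes" is
 * non-NULL, each entry of "status" receives the node the page ended up on,
 * or a negative errno (e.g. -ENOENT, -EACCES, -EFAULT) for a page that
 * could not be moved; when "nodes" is NULL the pages are not moved and
 * "status" simply reports the node each page currently resides on.
 */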

/*
 * Call the migration functions in the vma_ops that may prepare
 * memory in a vm for migration. Such migration functions may perform
 * the migration for vmas that do not have an underlying page struct.
 */
int migrate_vmas(struct mm_struct *mm, const nodemask_t *to,
	const nodemask_t *from, unsigned long flags)
{
	struct vm_area_struct *vma;
	int err = 0;

	for (vma = mm->mmap; vma->vm_next && !err; vma = vma->vm_next) {
		if (vma->vm_ops && vma->vm_ops->migrate) {
			err = vma->vm_ops->migrate(vma, to, from, flags);
			if (err)
				break;
		}
	}
	return err;
}
#endif