migrate.c revision 529ae9aaa08378cfe2a4350bded76f32cc8ff0ce
1/*
2 * Memory Migration functionality - linux/mm/migrate.c
3 *
4 * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
5 *
6 * Page migration was first developed in the context of the memory hotplug
7 * project. The main authors of the migration code are:
8 *
9 * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
10 * Hirokazu Takahashi <taka@valinux.co.jp>
11 * Dave Hansen <haveblue@us.ibm.com>
12 * Christoph Lameter
13 */
14
15#include <linux/migrate.h>
16#include <linux/module.h>
17#include <linux/swap.h>
18#include <linux/swapops.h>
19#include <linux/pagemap.h>
20#include <linux/buffer_head.h>
21#include <linux/mm_inline.h>
22#include <linux/nsproxy.h>
23#include <linux/pagevec.h>
24#include <linux/rmap.h>
25#include <linux/topology.h>
26#include <linux/cpu.h>
27#include <linux/cpuset.h>
28#include <linux/writeback.h>
29#include <linux/mempolicy.h>
30#include <linux/vmalloc.h>
31#include <linux/security.h>
32#include <linux/memcontrol.h>
33#include <linux/syscalls.h>
34
35#include "internal.h"
36
37#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
38
39/*
40 * Isolate one page from the LRU lists. If successful, put it onto
41 * the indicated list with an elevated page count.
42 *
43 * Result:
44 *  -EBUSY: page not on LRU list
45 *  0: page removed from LRU list and added to the specified list.
46 */
47int isolate_lru_page(struct page *page, struct list_head *pagelist)
48{
49	int ret = -EBUSY;
50
51	if (PageLRU(page)) {
52		struct zone *zone = page_zone(page);
53
54		spin_lock_irq(&zone->lru_lock);
55		if (PageLRU(page) && get_page_unless_zero(page)) {
56			ret = 0;
57			ClearPageLRU(page);
58			if (PageActive(page))
59				del_page_from_active_list(zone, page);
60			else
61				del_page_from_inactive_list(zone, page);
62			list_add_tail(&page->lru, pagelist);
63		}
64		spin_unlock_irq(&zone->lru_lock);
65	}
66	return ret;
67}
68
69/*
70 * migrate_prep() needs to be called before we start compiling a list of pages
71 * to be migrated using isolate_lru_page().
72 */
73int migrate_prep(void)
74{
75	/*
76	 * Clear the LRU lists so pages can be isolated.
77	 * Note that pages may be moved off the LRU after we have
78	 * drained them. Those pages will fail to migrate like other
79	 * pages that may be busy.
80	 */
81	lru_add_drain_all();
82
83	return 0;
84}
85
86static inline void move_to_lru(struct page *page)
87{
88	if (PageActive(page)) {
89		/*
90		 * lru_cache_add_active checks that
91		 * the PG_active bit is off.
92		 */
93		ClearPageActive(page);
94		lru_cache_add_active(page);
95	} else {
96		lru_cache_add(page);
97	}
98	put_page(page);
99}
100
101/*
102 * Add isolated pages on the list back to the LRU.
103 *
104 * returns the number of pages put back.
105 */
106int putback_lru_pages(struct list_head *l)
107{
108	struct page *page;
109	struct page *page2;
110	int count = 0;
111
112	list_for_each_entry_safe(page, page2, l, lru) {
113		list_del(&page->lru);
114		move_to_lru(page);
115		count++;
116	}
117	return count;
118}
119
120/*
121 * Restore a potential migration pte to a working pte entry
122 */
123static void remove_migration_pte(struct vm_area_struct *vma,
124		struct page *old, struct page *new)
125{
126	struct mm_struct *mm = vma->vm_mm;
127	swp_entry_t entry;
128	pgd_t *pgd;
129	pud_t *pud;
130	pmd_t *pmd;
131	pte_t *ptep, pte;
132	spinlock_t *ptl;
133	unsigned long addr = page_address_in_vma(new, vma);
134
135	if (addr == -EFAULT)
136		return;
137
138	pgd = pgd_offset(mm, addr);
139	if (!pgd_present(*pgd))
140		return;
141
142	pud = pud_offset(pgd, addr);
143	if (!pud_present(*pud))
144		return;
145
146	pmd = pmd_offset(pud, addr);
147	if (!pmd_present(*pmd))
148		return;
149
150	ptep = pte_offset_map(pmd, addr);
151
152	if (!is_swap_pte(*ptep)) {
153		pte_unmap(ptep);
154		return;
155	}
156
157	ptl = pte_lockptr(mm, pmd);
158	spin_lock(ptl);
159	pte = *ptep;
160	if (!is_swap_pte(pte))
161		goto out;
162
163	entry = pte_to_swp_entry(pte);
164
165	if (!is_migration_entry(entry) || migration_entry_to_page(entry) != old)
166		goto out;
167
168	/*
169	 * Yes, ignore the return value from a GFP_ATOMIC mem_cgroup_charge.
170	 * Failure is not an option here: we're now expected to remove every
171	 * migration pte, and will cause crashes otherwise.  Normally this
172	 * is not an issue: mem_cgroup_prepare_migration bumped up the old
173	 * page_cgroup count for safety, that's now attached to the new page,
174	 * so this charge should just be another incrementation of the count,
175	 * to keep in balance with rmap.c's mem_cgroup_uncharging.  But if
176	 * there's been a force_empty, those reference counts may no longer
177	 * be reliable, and this charge can actually fail: oh well, we don't
178	 * make the situation any worse by proceeding as if it had succeeded.
179	 */
180	mem_cgroup_charge(new, mm, GFP_ATOMIC);
181
182	get_page(new);
183	pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
184	if (is_write_migration_entry(entry))
185		pte = pte_mkwrite(pte);
186	flush_cache_page(vma, addr, pte_pfn(pte));
187	set_pte_at(mm, addr, ptep, pte);
188
189	if (PageAnon(new))
190		page_add_anon_rmap(new, vma, addr);
191	else
192		page_add_file_rmap(new);
193
194	/* No need to invalidate - it was non-present before */
195	update_mmu_cache(vma, addr, pte);
196
197out:
198	pte_unmap_unlock(ptep, ptl);
199}
200
201/*
202 * Note that remove_file_migration_ptes will only work on regular mappings;
203 * nonlinear mappings do not use migration entries.
204 */
205static void remove_file_migration_ptes(struct page *old, struct page *new)
206{
207	struct vm_area_struct *vma;
208	struct address_space *mapping = page_mapping(new);
209	struct prio_tree_iter iter;
210	pgoff_t pgoff = new->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
211
212	if (!mapping)
213		return;
214
215	spin_lock(&mapping->i_mmap_lock);
216
217	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff)
218		remove_migration_pte(vma, old, new);
219
220	spin_unlock(&mapping->i_mmap_lock);
221}
222
223/*
224 * Must hold mmap_sem lock on at least one of the vmas containing
225 * the page so that the anon_vma cannot vanish.
226 */
227static void remove_anon_migration_ptes(struct page *old, struct page *new)
228{
229	struct anon_vma *anon_vma;
230	struct vm_area_struct *vma;
231	unsigned long mapping;
232
233	mapping = (unsigned long)new->mapping;
234
235	if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0)
236		return;
237
238	/*
239	 * We hold the mmap_sem lock. So no need to call page_lock_anon_vma.
240	 */
241	anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON);
242	spin_lock(&anon_vma->lock);
243
244	list_for_each_entry(vma, &anon_vma->head, anon_vma_node)
245		remove_migration_pte(vma, old, new);
246
247	spin_unlock(&anon_vma->lock);
248}
249
250/*
251 * Get rid of all migration entries and replace them by
252 * references to the indicated page.
253 */
254static void remove_migration_ptes(struct page *old, struct page *new)
255{
256	if (PageAnon(new))
257		remove_anon_migration_ptes(old, new);
258	else
259		remove_file_migration_ptes(old, new);
260}
261
262/*
263 * Something used the pte of a page under migration. We need to
264 * get to the page and wait until migration is finished.
265 * When we return from this function the fault will be retried.
266 *
267 * This function is called from do_swap_page().
268 */
269void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
270				unsigned long address)
271{
272	pte_t *ptep, pte;
273	spinlock_t *ptl;
274	swp_entry_t entry;
275	struct page *page;
276
277	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
278	pte = *ptep;
279	if (!is_swap_pte(pte))
280		goto out;
281
282	entry = pte_to_swp_entry(pte);
283	if (!is_migration_entry(entry))
284		goto out;
285
286	page = migration_entry_to_page(entry);
287
288	/*
289	 * Once the radix-tree replacement step of page migration has started,
290	 * page_count *must* be zero; and we don't want to call
291	 * wait_on_page_locked() against a page we hold no reference to.
292	 * So we use get_page_unless_zero() here. Even if that fails, the
293	 * page fault will simply occur again.
294	 */
295	if (!get_page_unless_zero(page))
296		goto out;
297	pte_unmap_unlock(ptep, ptl);
298	wait_on_page_locked(page);
299	put_page(page);
300	return;
301out:
302	pte_unmap_unlock(ptep, ptl);
303}
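
/*
 * Illustrative note (editorial, not part of this revision): the do_swap_page()
 * path in mm/memory.c decodes the swap entry, and when is_migration_entry()
 * is true it calls migration_entry_wait(mm, pmd, address) and then lets the
 * fault be retried once migration has finished.
 */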
304
305/*
306 * Replace the page in the mapping.
307 *
308 * The number of remaining references must be:
309 * 1 for anonymous pages without a mapping
310 * 2 for pages with a mapping
311 * 3 for pages with a mapping and PagePrivate set.
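 *
 * Illustrative example (editorial note): a page-cache page with buffers that
 * has been isolated for migration is pinned by the radix tree, by the
 * caller's isolation reference and by its buffers (PagePrivate), so
 * page_count() is expected to be 3 at this point.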
312 */
313static int migrate_page_move_mapping(struct address_space *mapping,
314		struct page *newpage, struct page *page)
315{
316	int expected_count;
317	void **pslot;
318
319	if (!mapping) {
320		/* Anonymous page without mapping */
321		if (page_count(page) != 1)
322			return -EAGAIN;
323		return 0;
324	}
325
326	spin_lock_irq(&mapping->tree_lock);
327
328	pslot = radix_tree_lookup_slot(&mapping->page_tree,
329 					page_index(page));
330
331	expected_count = 2 + !!PagePrivate(page);
332	if (page_count(page) != expected_count ||
333			(struct page *)radix_tree_deref_slot(pslot) != page) {
334		spin_unlock_irq(&mapping->tree_lock);
335		return -EAGAIN;
336	}
337
338	if (!page_freeze_refs(page, expected_count)) {
339		spin_unlock_irq(&mapping->tree_lock);
340		return -EAGAIN;
341	}
342
343	/*
344	 * Now we know that no one else is looking at the page.
345	 */
346	get_page(newpage);	/* add cache reference */
347#ifdef CONFIG_SWAP
348	if (PageSwapCache(page)) {
349		SetPageSwapCache(newpage);
350		set_page_private(newpage, page_private(page));
351	}
352#endif
353
354	radix_tree_replace_slot(pslot, newpage);
355
356	page_unfreeze_refs(page, expected_count);
357	/*
358	 * Drop cache reference from old page.
359	 * We know this isn't the last reference.
360	 */
361	__put_page(page);
362
363	/*
364	 * If moved to a different zone then also account
365	 * the page for that zone. Other VM counters will be
366	 * taken care of when we establish references to the
367	 * new page and drop references to the old page.
368	 *
369	 * Note that anonymous pages are accounted for
370	 * via NR_FILE_PAGES and NR_ANON_PAGES if they
371	 * are mapped to swap space.
372	 */
373	__dec_zone_page_state(page, NR_FILE_PAGES);
374	__inc_zone_page_state(newpage, NR_FILE_PAGES);
375
376	spin_unlock_irq(&mapping->tree_lock);
377	if (!PageSwapCache(newpage))
378		mem_cgroup_uncharge_cache_page(page);
379
380	return 0;
381}
382
383/*
384 * Copy the page to its new location
385 */
386static void migrate_page_copy(struct page *newpage, struct page *page)
387{
388	copy_highpage(newpage, page);
389
390	if (PageError(page))
391		SetPageError(newpage);
392	if (PageReferenced(page))
393		SetPageReferenced(newpage);
394	if (PageUptodate(page))
395		SetPageUptodate(newpage);
396	if (PageActive(page))
397		SetPageActive(newpage);
398	if (PageChecked(page))
399		SetPageChecked(newpage);
400	if (PageMappedToDisk(page))
401		SetPageMappedToDisk(newpage);
402
403	if (PageDirty(page)) {
404		clear_page_dirty_for_io(page);
405		/*
406		 * Want to mark the page and the radix tree as dirty, and
407		 * redo the accounting that clear_page_dirty_for_io undid,
408		 * but we can't use set_page_dirty because that function
409		 * is actually a signal that all of the page has become dirty,
410		 * whereas only part of our page may be dirty.
411		 */
412		__set_page_dirty_nobuffers(newpage);
413	}
414
415#ifdef CONFIG_SWAP
416	ClearPageSwapCache(page);
417#endif
418	ClearPageActive(page);
419	ClearPagePrivate(page);
420	set_page_private(page, 0);
421	page->mapping = NULL;
422
423	/*
424	 * If any waiters have accumulated on the new page then
425	 * wake them up.
426	 */
427	if (PageWriteback(newpage))
428		end_page_writeback(newpage);
429}
430
431/************************************************************
432 *                    Migration functions
433 ***********************************************************/
434
435/* Always fail migration. Used for mappings that are not movable */
436int fail_migrate_page(struct address_space *mapping,
437			struct page *newpage, struct page *page)
438{
439	return -EIO;
440}
441EXPORT_SYMBOL(fail_migrate_page);
442
443/*
444 * Common logic to directly migrate a single page suitable for
445 * pages that do not use PagePrivate.
446 *
447 * Pages are locked upon entry and exit.
448 */
449int migrate_page(struct address_space *mapping,
450		struct page *newpage, struct page *page)
451{
452	int rc;
453
454	BUG_ON(PageWriteback(page));	/* Writeback must be complete */
455
456	rc = migrate_page_move_mapping(mapping, newpage, page);
457
458	if (rc)
459		return rc;
460
461	migrate_page_copy(newpage, page);
462	return 0;
463}
464EXPORT_SYMBOL(migrate_page);
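
/*
 * Illustrative example (editorial, not part of this revision): filesystems
 * whose pagecache pages carry no private data can point their
 * address_space_operations at migrate_page directly. The names below are
 * made up; shmem and the swap address space use migrate_page this way:
 *
 *	static const struct address_space_operations example_aops = {
 *		.writepage	= example_writepage,
 *		.migratepage	= migrate_page,
 *	};
 */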
465
466#ifdef CONFIG_BLOCK
467/*
468 * Migration function for pages with buffers. This function can only be used
469 * if the underlying filesystem guarantees that no other references to "page"
470 * exist.
471 */
472int buffer_migrate_page(struct address_space *mapping,
473		struct page *newpage, struct page *page)
474{
475	struct buffer_head *bh, *head;
476	int rc;
477
478	if (!page_has_buffers(page))
479		return migrate_page(mapping, newpage, page);
480
481	head = page_buffers(page);
482
483	rc = migrate_page_move_mapping(mapping, newpage, page);
484
485	if (rc)
486		return rc;
487
488	bh = head;
489	do {
490		get_bh(bh);
491		lock_buffer(bh);
492		bh = bh->b_this_page;
493
494	} while (bh != head);
495
496	ClearPagePrivate(page);
497	set_page_private(newpage, page_private(page));
498	set_page_private(page, 0);
499	put_page(page);
500	get_page(newpage);
501
502	bh = head;
503	do {
504		set_bh_page(bh, newpage, bh_offset(bh));
505		bh = bh->b_this_page;
506
507	} while (bh != head);
508
509	SetPagePrivate(newpage);
510
511	migrate_page_copy(newpage, page);
512
513	bh = head;
514	do {
515		unlock_buffer(bh);
516		put_bh(bh);
517		bh = bh->b_this_page;
518
519	} while (bh != head);
520
521	return 0;
522}
523EXPORT_SYMBOL(buffer_migrate_page);
524#endif
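
/*
 * Editorial note (not part of this revision): block device mappings
 * (fs/block_dev.c) and buffer-head based filesystems such as ext2 set
 * .migratepage = buffer_migrate_page in their address_space_operations,
 * which is what routes their pages through the function above.
 */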
525
526/*
527 * Writeback a page to clean the dirty state
528 */
529static int writeout(struct address_space *mapping, struct page *page)
530{
531	struct writeback_control wbc = {
532		.sync_mode = WB_SYNC_NONE,
533		.nr_to_write = 1,
534		.range_start = 0,
535		.range_end = LLONG_MAX,
536		.nonblocking = 1,
537		.for_reclaim = 1
538	};
539	int rc;
540
541	if (!mapping->a_ops->writepage)
542		/* No write method for the address space */
543		return -EINVAL;
544
545	if (!clear_page_dirty_for_io(page))
546		/* Someone else already triggered a write */
547		return -EAGAIN;
548
549	/*
550	 * A dirty page may imply that the underlying filesystem has
551	 * the page on some queue. So the page must be clean for
552	 * migration. Writeout may mean we lose the lock and the
553	 * page state is no longer what we checked for earlier.
554	 * At this point we know that the migration attempt cannot
555	 * be successful.
556	 */
557	remove_migration_ptes(page, page);
558
559	rc = mapping->a_ops->writepage(page, &wbc);
560	if (rc < 0)
561		/* I/O Error writing */
562		return -EIO;
563
564	if (rc != AOP_WRITEPAGE_ACTIVATE)
565		/* unlocked. Relock */
566		lock_page(page);
567
568	return -EAGAIN;
569}
570
571/*
572 * Default handling if a filesystem does not provide a migration function.
573 */
574static int fallback_migrate_page(struct address_space *mapping,
575	struct page *newpage, struct page *page)
576{
577	if (PageDirty(page))
578		return writeout(mapping, page);
579
580	/*
581	 * Buffers may be managed in a filesystem specific way.
582	 * We must have no buffers or drop them.
583	 */
584	if (PagePrivate(page) &&
585	    !try_to_release_page(page, GFP_KERNEL))
586		return -EAGAIN;
587
588	return migrate_page(mapping, newpage, page);
589}
590
591/*
592 * Move a page to a newly allocated page.
593 * The page is locked and all ptes have been successfully removed.
594 *
595 * The new page will have replaced the old page if this function
596 * is successful.
597 */
598static int move_to_new_page(struct page *newpage, struct page *page)
599{
600	struct address_space *mapping;
601	int rc;
602
603	/*
604	 * Block others from accessing the page when we get around to
605	 * establishing additional references. We are the only one
606	 * holding a reference to the new page at this point.
607	 */
608	if (!trylock_page(newpage))
609		BUG();
610
611	/* Prepare mapping for the new page.*/
612	newpage->index = page->index;
613	newpage->mapping = page->mapping;
614
615	mapping = page_mapping(page);
616	if (!mapping)
617		rc = migrate_page(mapping, newpage, page);
618	else if (mapping->a_ops->migratepage)
619		/*
620		 * Most pages have a mapping and most filesystems
621		 * should provide a migration function. Anonymous
622		 * pages are part of swap space which also has its
623		 * own migration function. This is the most common
624		 * path for page migration.
625		 */
626		rc = mapping->a_ops->migratepage(mapping,
627						newpage, page);
628	else
629		rc = fallback_migrate_page(mapping, newpage, page);
630
631	if (!rc) {
632		remove_migration_ptes(page, newpage);
633	} else
634		newpage->mapping = NULL;
635
636	unlock_page(newpage);
637
638	return rc;
639}
640
641/*
642 * Obtain the lock on page, remove all ptes and migrate the page
643 * to the newly allocated page in newpage.
644 */
645static int unmap_and_move(new_page_t get_new_page, unsigned long private,
646			struct page *page, int force)
647{
648	int rc = 0;
649	int *result = NULL;
650	struct page *newpage = get_new_page(page, private, &result);
651	int rcu_locked = 0;
652	int charge = 0;
653
654	if (!newpage)
655		return -ENOMEM;
656
657	if (page_count(page) == 1)
658		/* page was freed from under us. So we are done. */
659		goto move_newpage;
660
661	charge = mem_cgroup_prepare_migration(page, newpage);
662	if (charge == -ENOMEM) {
663		rc = -ENOMEM;
664		goto move_newpage;
665	}
666	/* prepare cgroup just returns 0 or -ENOMEM */
667	BUG_ON(charge);
668
669	rc = -EAGAIN;
670	if (!trylock_page(page)) {
671		if (!force)
672			goto move_newpage;
673		lock_page(page);
674	}
675
676	if (PageWriteback(page)) {
677		if (!force)
678			goto unlock;
679		wait_on_page_writeback(page);
680	}
681	/*
682	 * try_to_unmap() drops page->mapcount to 0 here, and without the
683	 * rcu_read_lock() below we could not notice the anon_vma being
684	 * freed while we migrate the page: the rcu_read_lock() delays
685	 * freeing of the anon_vma until the end of migration. File cache
686	 * pages are no problem because they stay protected by lock_page()
687	 * during migration, so only anonymous pages need this care here.
688	 */
689	if (PageAnon(page)) {
690		rcu_read_lock();
691		rcu_locked = 1;
692	}
693
694	/*
695	 * Corner case handling:
696	 * 1. When a new swap-cache page is read in, it is added to the LRU
697	 * and treated as swapcache but it has no rmap yet.
698	 * Calling try_to_unmap() against a page->mapping==NULL page will
699	 * trigger a BUG.  So handle it here.
700	 * 2. An orphaned page (see truncate_complete_page) might have
701	 * fs-private metadata. The page can be picked up due to memory
702	 * offlining.  Everywhere else except page reclaim, the page is
703	 * invisible to the vm, so the page cannot be migrated.  So try to
704	 * free the metadata, so the page can be freed.
705	 */
706	if (!page->mapping) {
707		if (!PageAnon(page) && PagePrivate(page)) {
708			/*
709			 * Go direct to try_to_free_buffers() here because
710			 * a) that's what try_to_release_page() would do anyway
711			 * b) we may be under rcu_read_lock() here, so we can't
712			 *    use GFP_KERNEL which is what try_to_release_page()
713			 *    needs to be effective.
714			 */
715			try_to_free_buffers(page);
716		}
717		goto rcu_unlock;
718	}
719
720	/* Establish migration ptes or remove ptes */
721	try_to_unmap(page, 1);
722
723	if (!page_mapped(page))
724		rc = move_to_new_page(newpage, page);
725
726	if (rc)
727		remove_migration_ptes(page, page);
728rcu_unlock:
729	if (rcu_locked)
730		rcu_read_unlock();
731
732unlock:
733
734	unlock_page(page);
735
736	if (rc != -EAGAIN) {
737		/*
738		 * A page that has been migrated has all references
739		 * removed and will be freed. A page that has not been
740		 * migrated will have kept its references and be
741		 * restored.
742		 */
743		list_del(&page->lru);
744		move_to_lru(page);
745	}
746
747move_newpage:
748	if (!charge)
749		mem_cgroup_end_migration(newpage);
750	/*
751	 * Move the new page to the LRU. If migration was not successful
752	 * then this will free the page.
753	 */
754	move_to_lru(newpage);
755	if (result) {
756		if (rc)
757			*result = rc;
758		else
759			*result = page_to_nid(newpage);
760	}
761	return rc;
762}
763
764/*
765 * migrate_pages
766 *
767 * The function takes a list of pages to migrate and a callback that,
768 * given a page to be migrated and the private data, allocates the
769 * target page for the move.
770 *
771 * The function returns after 10 attempts or if no pages
772 * are movable anymore because the list has become empty
773 * or no retryable pages exist anymore. All pages will be
774 * returned to the LRU or freed.
775 *
776 * Return: Number of pages not migrated or error code.
777 */
778int migrate_pages(struct list_head *from,
779		new_page_t get_new_page, unsigned long private)
780{
781	int retry = 1;
782	int nr_failed = 0;
783	int pass = 0;
784	struct page *page;
785	struct page *page2;
786	int swapwrite = current->flags & PF_SWAPWRITE;
787	int rc;
788
789	if (!swapwrite)
790		current->flags |= PF_SWAPWRITE;
791
792	for(pass = 0; pass < 10 && retry; pass++) {
793		retry = 0;
794
795		list_for_each_entry_safe(page, page2, from, lru) {
796			cond_resched();
797
798			rc = unmap_and_move(get_new_page, private,
799						page, pass > 2);
800
801			switch(rc) {
802			case -ENOMEM:
803				goto out;
804			case -EAGAIN:
805				retry++;
806				break;
807			case 0:
808				break;
809			default:
810				/* Permanent failure */
811				nr_failed++;
812				break;
813			}
814		}
815	}
816	rc = 0;
817out:
818	if (!swapwrite)
819		current->flags &= ~PF_SWAPWRITE;
820
821	putback_lru_pages(from);
822
823	if (rc)
824		return rc;
825
826	return nr_failed + retry;
827}
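
/*
 * Illustrative usage sketch (editorial, not part of this revision): a caller
 * typically drains the pagevecs, isolates the pages it wants to move and
 * hands them to migrate_pages() together with an allocation callback. All
 * names prefixed with example_ are made up:
 *
 *	static struct page *example_new_page(struct page *page,
 *					unsigned long private, int **result)
 *	{
 *		return alloc_pages_node((int)private,
 *					GFP_HIGHUSER_MOVABLE, 0);
 *	}
 *
 *	static int example_move_page_to_node(struct page *page, int nid)
 *	{
 *		LIST_HEAD(pagelist);
 *		int err;
 *
 *		migrate_prep();
 *		err = isolate_lru_page(page, &pagelist);
 *		if (err)
 *			return err;
 *		return migrate_pages(&pagelist, example_new_page,
 *					(unsigned long)nid);
 *	}
 *
 * Note that migrate_pages() already calls putback_lru_pages() on anything
 * left on the list, so the sketch above does not need its own putback.
 */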
828
829#ifdef CONFIG_NUMA
830/*
831 * Move a list of individual pages
832 */
833struct page_to_node {
834	unsigned long addr;
835	struct page *page;
836	int node;
837	int status;
838};
839
840static struct page *new_page_node(struct page *p, unsigned long private,
841		int **result)
842{
843	struct page_to_node *pm = (struct page_to_node *)private;
844
845	while (pm->node != MAX_NUMNODES && pm->page != p)
846		pm++;
847
848	if (pm->node == MAX_NUMNODES)
849		return NULL;
850
851	*result = &pm->status;
852
853	return alloc_pages_node(pm->node,
854				GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0);
855}
856
857/*
858 * Move a set of pages as indicated in the pm array. The addr
859 * field must be set to the virtual address of the page to be moved
860 * and the node number must contain a valid target node.
861 */
862static int do_move_pages(struct mm_struct *mm, struct page_to_node *pm,
863				int migrate_all)
864{
865	int err;
866	struct page_to_node *pp;
867	LIST_HEAD(pagelist);
868
869	down_read(&mm->mmap_sem);
870
871	/*
872	 * Build a list of pages to migrate
873	 */
874	migrate_prep();
875	for (pp = pm; pp->node != MAX_NUMNODES; pp++) {
876		struct vm_area_struct *vma;
877		struct page *page;
878
879		/*
880		 * A valid page pointer that will not match any of the
881		 * pages that will be moved.
882		 */
883		pp->page = ZERO_PAGE(0);
884
885		err = -EFAULT;
886		vma = find_vma(mm, pp->addr);
887		if (!vma || !vma_migratable(vma))
888			goto set_status;
889
890		page = follow_page(vma, pp->addr, FOLL_GET);
891
892		err = PTR_ERR(page);
893		if (IS_ERR(page))
894			goto set_status;
895
896		err = -ENOENT;
897		if (!page)
898			goto set_status;
899
900		if (PageReserved(page))		/* Check for zero page */
901			goto put_and_set;
902
903		pp->page = page;
904		err = page_to_nid(page);
905
906		if (err == pp->node)
907			/*
908			 * Node already in the right place
909			 */
910			goto put_and_set;
911
912		err = -EACCES;
913		if (page_mapcount(page) > 1 &&
914				!migrate_all)
915			goto put_and_set;
916
917		err = isolate_lru_page(page, &pagelist);
918put_and_set:
919		/*
920		 * Either remove the duplicate refcount from
921		 * isolate_lru_page() or drop the page ref if it was
922		 * not isolated.
923		 */
924		put_page(page);
925set_status:
926		pp->status = err;
927	}
928
929	if (!list_empty(&pagelist))
930		err = migrate_pages(&pagelist, new_page_node,
931				(unsigned long)pm);
932	else
933		err = -ENOENT;
934
935	up_read(&mm->mmap_sem);
936	return err;
937}
938
939/*
940 * Determine the nodes of a list of pages. The addr field in the pm array
941 * must have been set to the virtual address of the page whose node
942 * number we want to determine.
943 */
944static int do_pages_stat(struct mm_struct *mm, struct page_to_node *pm)
945{
946	down_read(&mm->mmap_sem);
947
948	for ( ; pm->node != MAX_NUMNODES; pm++) {
949		struct vm_area_struct *vma;
950		struct page *page;
951		int err;
952
953		err = -EFAULT;
954		vma = find_vma(mm, pm->addr);
955		if (!vma)
956			goto set_status;
957
958		page = follow_page(vma, pm->addr, 0);
959
960		err = PTR_ERR(page);
961		if (IS_ERR(page))
962			goto set_status;
963
964		err = -ENOENT;
965		/* Use PageReserved to check for zero page */
966		if (!page || PageReserved(page))
967			goto set_status;
968
969		err = page_to_nid(page);
970set_status:
971		pm->status = err;
972	}
973
974	up_read(&mm->mmap_sem);
975	return 0;
976}
977
978/*
979 * Move a list of pages in the address space of the currently executing
980 * process.
981 */
982asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages,
983			const void __user * __user *pages,
984			const int __user *nodes,
985			int __user *status, int flags)
986{
987	int err = 0;
988	int i;
989	struct task_struct *task;
990	nodemask_t task_nodes;
991	struct mm_struct *mm;
992	struct page_to_node *pm = NULL;
993
994	/* Check flags */
995	if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
996		return -EINVAL;
997
998	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
999		return -EPERM;
1000
1001	/* Find the mm_struct */
1002	read_lock(&tasklist_lock);
1003	task = pid ? find_task_by_vpid(pid) : current;
1004	if (!task) {
1005		read_unlock(&tasklist_lock);
1006		return -ESRCH;
1007	}
1008	mm = get_task_mm(task);
1009	read_unlock(&tasklist_lock);
1010
1011	if (!mm)
1012		return -EINVAL;
1013
1014	/*
1015	 * Check if this process has the right to modify the specified
1016	 * process. The right exists if the process has administrative
1017	 * capabilities, superuser privileges or the same
1018	 * userid as the target process.
1019	 */
1020	if ((current->euid != task->suid) && (current->euid != task->uid) &&
1021	    (current->uid != task->suid) && (current->uid != task->uid) &&
1022	    !capable(CAP_SYS_NICE)) {
1023		err = -EPERM;
1024		goto out2;
1025	}
1026
1027	err = security_task_movememory(task);
1028	if (err)
1029		goto out2;
1030
1031
1032	task_nodes = cpuset_mems_allowed(task);
1033
1034	/* Limit nr_pages so that the multiplication cannot overflow */
1035	if (nr_pages >= ULONG_MAX / sizeof(struct page_to_node) - 1) {
1036		err = -E2BIG;
1037		goto out2;
1038	}
1039
1040	pm = vmalloc((nr_pages + 1) * sizeof(struct page_to_node));
1041	if (!pm) {
1042		err = -ENOMEM;
1043		goto out2;
1044	}
1045
1046	/*
1047	 * Get parameters from user space and initialize the pm
1048	 * array. Return various errors if the user did something wrong.
1049	 */
1050	for (i = 0; i < nr_pages; i++) {
1051		const void __user *p;
1052
1053		err = -EFAULT;
1054		if (get_user(p, pages + i))
1055			goto out;
1056
1057		pm[i].addr = (unsigned long)p;
1058		if (nodes) {
1059			int node;
1060
1061			if (get_user(node, nodes + i))
1062				goto out;
1063
1064			err = -ENODEV;
1065			if (!node_state(node, N_HIGH_MEMORY))
1066				goto out;
1067
1068			err = -EACCES;
1069			if (!node_isset(node, task_nodes))
1070				goto out;
1071
1072			pm[i].node = node;
1073		} else
1074			pm[i].node = 0;	/* anything to not match MAX_NUMNODES */
1075	}
1076	/* End marker */
1077	pm[nr_pages].node = MAX_NUMNODES;
1078
1079	if (nodes)
1080		err = do_move_pages(mm, pm, flags & MPOL_MF_MOVE_ALL);
1081	else
1082		err = do_pages_stat(mm, pm);
1083
1084	if (err >= 0)
1085		/* Return status information */
1086		for (i = 0; i < nr_pages; i++)
1087			if (put_user(pm[i].status, status + i))
1088				err = -EFAULT;
1089
1090out:
1091	vfree(pm);
1092out2:
1093	mmput(mm);
1094	return err;
1095}
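
/*
 * Illustrative userspace view (editorial, not part of this revision): via the
 * move_pages() wrapper declared in <numaif.h>, the syscall above can be used
 * like this to move one page of the calling process (pid 0) to node 1 and
 * read back its status; some_address stands for any address in the caller's
 * address space:
 *
 *	void *pages[1] = { some_address };
 *	int nodes[1] = { 1 };
 *	int status[1];
 *
 *	long rc = move_pages(0, 1, pages, nodes, status, MPOL_MF_MOVE);
 *	if (rc == 0)
 *		printf("page is now on node %d\n", status[0]);
 */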
1096
1097/*
1098 * Call migration functions in the vm_ops that may prepare
1099 * memory in a vma for migration. Migration functions may perform
1100 * the migration for vmas that do not have an underlying page struct.
1101 */
1102int migrate_vmas(struct mm_struct *mm, const nodemask_t *to,
1103	const nodemask_t *from, unsigned long flags)
1104{
1105	struct vm_area_struct *vma;
1106	int err = 0;
1107
1108	for (vma = mm->mmap; vma->vm_next && !err; vma = vma->vm_next) {
1109		if (vma->vm_ops && vma->vm_ops->migrate) {
1110			err = vma->vm_ops->migrate(vma, to, from, flags);
1111			if (err)
1112				break;
1113		}
1114	}
1115	return err;
1116}
1117#endif
1118