migrate.c revision 698dd4ba6b12e34e1e432c944c01478c0b2cd773
/*
 * Memory Migration functionality - linux/mm/migration.c
 *
 * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
 *
 * Page migration was first developed in the context of the memory hotplug
 * project. The main authors of the migration code are:
 *
 * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
 * Hirokazu Takahashi <taka@valinux.co.jp>
 * Dave Hansen <haveblue@us.ibm.com>
 * Christoph Lameter <clameter@sgi.com>
 */

#include <linux/migrate.h>
#include <linux/module.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/pagemap.h>
#include <linux/buffer_head.h>
#include <linux/mm_inline.h>
#include <linux/nsproxy.h>
#include <linux/pagevec.h>
#include <linux/rmap.h>
#include <linux/topology.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/writeback.h>
#include <linux/mempolicy.h>
#include <linux/vmalloc.h>
#include <linux/security.h>

#include "internal.h"

#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))

/*
 * Isolate one page from the LRU lists. If successful, put it onto
 * the indicated list with an elevated page count.
 *
 * Result:
 *  -EBUSY: page not on LRU list
 *  0: page removed from LRU list and added to the specified list.
 */
int isolate_lru_page(struct page *page, struct list_head *pagelist)
{
        int ret = -EBUSY;

        if (PageLRU(page)) {
                struct zone *zone = page_zone(page);

                spin_lock_irq(&zone->lru_lock);
                if (PageLRU(page) && get_page_unless_zero(page)) {
                        ret = 0;
                        ClearPageLRU(page);
                        if (PageActive(page))
                                del_page_from_active_list(zone, page);
                        else
                                del_page_from_inactive_list(zone, page);
                        list_add_tail(&page->lru, pagelist);
                }
                spin_unlock_irq(&zone->lru_lock);
        }
        return ret;
}

/*
 * migrate_prep() needs to be called before we start compiling a list of pages
 * to be migrated using isolate_lru_page().
 */
int migrate_prep(void)
{
        /*
         * Clear the LRU lists so pages can be isolated.
         * Note that pages may be moved off the LRU after we have
         * drained them. Those pages will fail to migrate like other
         * pages that may be busy.
         */
        lru_add_drain_all();

        return 0;
}

static inline void move_to_lru(struct page *page)
{
        if (PageActive(page)) {
                /*
                 * lru_cache_add_active checks that
                 * the PG_active bit is off.
                 */
                ClearPageActive(page);
                lru_cache_add_active(page);
        } else {
                lru_cache_add(page);
        }
        put_page(page);
}

/*
 * Add isolated pages on the list back to the LRU.
 *
 * Returns the number of pages put back.
 */
int putback_lru_pages(struct list_head *l)
{
        struct page *page;
        struct page *page2;
        int count = 0;

        list_for_each_entry_safe(page, page2, l, lru) {
                list_del(&page->lru);
                move_to_lru(page);
                count++;
        }
        return count;
}
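/*
 * Illustrative sketch (not part of this revision): how a hypothetical
 * caller strings these helpers together. get_new_page and private
 * stand in for the caller's allocator callback and its cookie.
 *
 *	LIST_HEAD(pagelist);
 *
 *	migrate_prep();
 *	for each candidate page:
 *		isolate_lru_page(page, &pagelist);
 *	err = migrate_pages(&pagelist, get_new_page, private);
 *
 * There is no need to call putback_lru_pages() afterwards;
 * migrate_pages() returns every page left on the list to the LRU
 * before it returns, whether or not migration succeeded.
 */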
/*
 * Restore a potential migration pte to a working pte entry
 */
static void remove_migration_pte(struct vm_area_struct *vma,
                struct page *old, struct page *new)
{
        struct mm_struct *mm = vma->vm_mm;
        swp_entry_t entry;
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *ptep, pte;
        spinlock_t *ptl;
        unsigned long addr = page_address_in_vma(new, vma);

        if (addr == -EFAULT)
                return;

        pgd = pgd_offset(mm, addr);
        if (!pgd_present(*pgd))
                return;

        pud = pud_offset(pgd, addr);
        if (!pud_present(*pud))
                return;

        pmd = pmd_offset(pud, addr);
        if (!pmd_present(*pmd))
                return;

        ptep = pte_offset_map(pmd, addr);

        if (!is_swap_pte(*ptep)) {
                pte_unmap(ptep);
                return;
        }

        ptl = pte_lockptr(mm, pmd);
        spin_lock(ptl);
        pte = *ptep;
        if (!is_swap_pte(pte))
                goto out;

        entry = pte_to_swp_entry(pte);

        if (!is_migration_entry(entry) || migration_entry_to_page(entry) != old)
                goto out;

        get_page(new);
        pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
        if (is_write_migration_entry(entry))
                pte = pte_mkwrite(pte);
        flush_cache_page(vma, addr, pte_pfn(pte));
        set_pte_at(mm, addr, ptep, pte);

        if (PageAnon(new))
                page_add_anon_rmap(new, vma, addr);
        else
                page_add_file_rmap(new);

        /* No need to invalidate - it was non-present before */
        update_mmu_cache(vma, addr, pte);

out:
        pte_unmap_unlock(ptep, ptl);
}

/*
 * Note that remove_file_migration_ptes will only work on regular mappings;
 * nonlinear mappings do not use migration entries.
 */
static void remove_file_migration_ptes(struct page *old, struct page *new)
{
        struct vm_area_struct *vma;
        struct address_space *mapping = page_mapping(new);
        struct prio_tree_iter iter;
        pgoff_t pgoff = new->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);

        if (!mapping)
                return;

        spin_lock(&mapping->i_mmap_lock);

        vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff)
                remove_migration_pte(vma, old, new);

        spin_unlock(&mapping->i_mmap_lock);
}

/*
 * Must hold mmap_sem lock on at least one of the vmas containing
 * the page so that the anon_vma cannot vanish.
 */
static void remove_anon_migration_ptes(struct page *old, struct page *new)
{
        struct anon_vma *anon_vma;
        struct vm_area_struct *vma;
        unsigned long mapping;

        mapping = (unsigned long)new->mapping;

        if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0)
                return;

        /*
         * We hold the mmap_sem lock. So no need to call page_lock_anon_vma.
         */
        anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON);
        spin_lock(&anon_vma->lock);

        list_for_each_entry(vma, &anon_vma->head, anon_vma_node)
                remove_migration_pte(vma, old, new);

        spin_unlock(&anon_vma->lock);
}
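/*
 * For reference, a sketch of the inverse operation (hedged; the actual
 * code lives in try_to_unmap_one() in mm/rmap.c, using the helpers from
 * <linux/swapops.h>): while a page is being migrated, its ptes are
 * replaced by migration entries roughly like
 *
 *	entry = make_migration_entry(page, pte_write(pteval));
 *	set_pte_at(mm, address, ptep, swp_entry_to_pte(entry));
 *
 * remove_migration_pte() above undoes exactly this transformation,
 * pointing the pte at the new page instead of the old one.
 */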
/*
 * Get rid of all migration entries and replace them by
 * references to the indicated page.
 */
static void remove_migration_ptes(struct page *old, struct page *new)
{
        if (PageAnon(new))
                remove_anon_migration_ptes(old, new);
        else
                remove_file_migration_ptes(old, new);
}

/*
 * Something used the pte of a page under migration. We need to
 * get to the page and wait until migration is finished.
 * When we return from this function the fault will be retried.
 *
 * This function is called from do_swap_page().
 */
void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
                                unsigned long address)
{
        pte_t *ptep, pte;
        spinlock_t *ptl;
        swp_entry_t entry;
        struct page *page;

        ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
        pte = *ptep;
        if (!is_swap_pte(pte))
                goto out;

        entry = pte_to_swp_entry(pte);
        if (!is_migration_entry(entry))
                goto out;

        page = migration_entry_to_page(entry);

        get_page(page);
        pte_unmap_unlock(ptep, ptl);
        wait_on_page_locked(page);
        put_page(page);
        return;
out:
        pte_unmap_unlock(ptep, ptl);
}
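/*
 * For context, a sketch of the caller side (hedged from mm/memory.c of
 * this era): do_swap_page() recognizes a migration entry before treating
 * the pte as real swap, and waits here instead:
 *
 *	entry = pte_to_swp_entry(orig_pte);
 *	if (is_migration_entry(entry)) {
 *		migration_entry_wait(mm, pmd, address);
 *		goto out;
 *	}
 *
 * The fault returns and is retried once migration has unlocked the page.
 */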
/*
 * Replace the page in the mapping.
 *
 * The number of remaining references must be:
 * 1 for anonymous pages without a mapping
 * 2 for pages with a mapping
 * 3 for pages with a mapping and PagePrivate set.
 */
static int migrate_page_move_mapping(struct address_space *mapping,
                struct page *newpage, struct page *page)
{
        void **pslot;

        if (!mapping) {
                /* Anonymous page without mapping */
                if (page_count(page) != 1)
                        return -EAGAIN;
                return 0;
        }

        write_lock_irq(&mapping->tree_lock);

        pslot = radix_tree_lookup_slot(&mapping->page_tree,
                                        page_index(page));

        if (page_count(page) != 2 + !!PagePrivate(page) ||
                        (struct page *)radix_tree_deref_slot(pslot) != page) {
                write_unlock_irq(&mapping->tree_lock);
                return -EAGAIN;
        }

        /*
         * Now we know that no one else is looking at the page.
         */
        get_page(newpage);      /* add cache reference */
#ifdef CONFIG_SWAP
        if (PageSwapCache(page)) {
                SetPageSwapCache(newpage);
                set_page_private(newpage, page_private(page));
        }
#endif

        radix_tree_replace_slot(pslot, newpage);

        /*
         * Drop cache reference from old page.
         * We know this isn't the last reference.
         */
        __put_page(page);

        /*
         * If moved to a different zone then also account
         * the page for that zone. Other VM counters will be
         * taken care of when we establish references to the
         * new page and drop references to the old page.
         *
         * Note that anonymous pages are accounted for
         * via NR_FILE_PAGES and NR_ANON_PAGES if they
         * are mapped to swap space.
         */
        __dec_zone_page_state(page, NR_FILE_PAGES);
        __inc_zone_page_state(newpage, NR_FILE_PAGES);

        write_unlock_irq(&mapping->tree_lock);

        return 0;
}

/*
 * Copy the page to its new location
 */
static void migrate_page_copy(struct page *newpage, struct page *page)
{
        copy_highpage(newpage, page);

        if (PageError(page))
                SetPageError(newpage);
        if (PageReferenced(page))
                SetPageReferenced(newpage);
        if (PageUptodate(page))
                SetPageUptodate(newpage);
        if (PageActive(page))
                SetPageActive(newpage);
        if (PageChecked(page))
                SetPageChecked(newpage);
        if (PageMappedToDisk(page))
                SetPageMappedToDisk(newpage);

        if (PageDirty(page)) {
                clear_page_dirty_for_io(page);
                set_page_dirty(newpage);
        }

#ifdef CONFIG_SWAP
        ClearPageSwapCache(page);
#endif
        ClearPageActive(page);
        ClearPagePrivate(page);
        set_page_private(page, 0);
        page->mapping = NULL;

        /*
         * If any waiters have accumulated on the new page then
         * wake them up.
         */
        if (PageWriteback(newpage))
                end_page_writeback(newpage);
}

/************************************************************
 * Migration functions
 ***********************************************************/

/* Always fail migration. Used for mappings that are not movable */
int fail_migrate_page(struct address_space *mapping,
                        struct page *newpage, struct page *page)
{
        return -EIO;
}
EXPORT_SYMBOL(fail_migrate_page);

/*
 * Common logic to directly migrate a single page suitable for
 * pages that do not use PagePrivate.
 *
 * Pages are locked upon entry and exit.
 */
int migrate_page(struct address_space *mapping,
                struct page *newpage, struct page *page)
{
        int rc;

        BUG_ON(PageWriteback(page));    /* Writeback must be complete */

        rc = migrate_page_move_mapping(mapping, newpage, page);

        if (rc)
                return rc;

        migrate_page_copy(newpage, page);
        return 0;
}
EXPORT_SYMBOL(migrate_page);
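/*
 * migrate_page() is also what anonymous pages in the swap cache use:
 * the swap address space wires it into its address_space_operations
 * (sketched from memory of mm/swap_state.c in this era):
 *
 *	static const struct address_space_operations swap_aops = {
 *		.writepage	= swap_writepage,
 *		...
 *		.migratepage	= migrate_page,
 *	};
 */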
#ifdef CONFIG_BLOCK
/*
 * Migration function for pages with buffers. This function can only be used
 * if the underlying filesystem guarantees that no other references to "page"
 * exist.
 */
int buffer_migrate_page(struct address_space *mapping,
                struct page *newpage, struct page *page)
{
        struct buffer_head *bh, *head;
        int rc;

        if (!page_has_buffers(page))
                return migrate_page(mapping, newpage, page);

        head = page_buffers(page);

        rc = migrate_page_move_mapping(mapping, newpage, page);

        if (rc)
                return rc;

        bh = head;
        do {
                get_bh(bh);
                lock_buffer(bh);
                bh = bh->b_this_page;

        } while (bh != head);

        ClearPagePrivate(page);
        set_page_private(newpage, page_private(page));
        set_page_private(page, 0);
        put_page(page);
        get_page(newpage);

        bh = head;
        do {
                set_bh_page(bh, newpage, bh_offset(bh));
                bh = bh->b_this_page;

        } while (bh != head);

        SetPagePrivate(newpage);

        migrate_page_copy(newpage, page);

        bh = head;
        do {
                unlock_buffer(bh);
                put_bh(bh);
                bh = bh->b_this_page;

        } while (bh != head);

        return 0;
}
EXPORT_SYMBOL(buffer_migrate_page);
#endif

/*
 * Writeback a page to clean the dirty state
 */
static int writeout(struct address_space *mapping, struct page *page)
{
        struct writeback_control wbc = {
                .sync_mode = WB_SYNC_NONE,
                .nr_to_write = 1,
                .range_start = 0,
                .range_end = LLONG_MAX,
                .nonblocking = 1,
                .for_reclaim = 1
        };
        int rc;

        if (!mapping->a_ops->writepage)
                /* No write method for the address space */
                return -EINVAL;

        if (!clear_page_dirty_for_io(page))
                /* Someone else already triggered a write */
                return -EAGAIN;

        /*
         * A dirty page may imply that the underlying filesystem has
         * the page on some queue. So the page must be clean for
         * migration. Writeout may mean we lose the lock and the
         * page state is no longer what we checked for earlier.
         * At this point we know that the migration attempt cannot
         * be successful.
         */
        remove_migration_ptes(page, page);

        rc = mapping->a_ops->writepage(page, &wbc);
        if (rc < 0)
                /* I/O Error writing */
                return -EIO;

        if (rc != AOP_WRITEPAGE_ACTIVATE)
                /* unlocked. Relock */
                lock_page(page);

        return -EAGAIN;
}

/*
 * Default handling if a filesystem does not provide a migration function.
 */
static int fallback_migrate_page(struct address_space *mapping,
        struct page *newpage, struct page *page)
{
        if (PageDirty(page))
                return writeout(mapping, page);

        /*
         * Buffers may be managed in a filesystem specific way.
         * We must have no buffers or drop them.
         */
        if (PagePrivate(page) &&
            !try_to_release_page(page, GFP_KERNEL))
                return -EAGAIN;

        return migrate_page(mapping, newpage, page);
}
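/*
 * A hypothetical block-based filesystem would opt in the same way
 * (example_aops is illustrative, not a real structure):
 *
 *	static const struct address_space_operations example_aops = {
 *		...
 *		.migratepage	= buffer_migrate_page,
 *	};
 *
 * A filesystem that provides no ->migratepage method at all gets
 * fallback_migrate_page() above, which can only handle clean pages
 * whose buffers are droppable.
 */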
/*
 * Move a page to a newly allocated page
 * The page is locked and all ptes have been successfully removed.
 *
 * The new page will have replaced the old page if this function
 * is successful.
 */
static int move_to_new_page(struct page *newpage, struct page *page)
{
        struct address_space *mapping;
        int rc;

        /*
         * Block others from accessing the page when we get around to
         * establishing additional references. We are the only one
         * holding a reference to the new page at this point.
         */
        if (TestSetPageLocked(newpage))
                BUG();

        /* Prepare mapping for the new page. */
        newpage->index = page->index;
        newpage->mapping = page->mapping;

        mapping = page_mapping(page);
        if (!mapping)
                rc = migrate_page(mapping, newpage, page);
        else if (mapping->a_ops->migratepage)
                /*
                 * Most pages have a mapping and most filesystems
                 * should provide a migration function. Anonymous
                 * pages are part of swap space which also has its
                 * own migration function. This is the most common
                 * path for page migration.
                 */
                rc = mapping->a_ops->migratepage(mapping,
                                                newpage, page);
        else
                rc = fallback_migrate_page(mapping, newpage, page);

        if (!rc)
                remove_migration_ptes(page, newpage);
        else
                newpage->mapping = NULL;

        unlock_page(newpage);

        return rc;
}

/*
 * Obtain the lock on page, remove all ptes and migrate the page
 * to the newly allocated page in newpage.
 */
static int unmap_and_move(new_page_t get_new_page, unsigned long private,
                        struct page *page, int force)
{
        int rc = 0;
        int *result = NULL;
        struct page *newpage = get_new_page(page, private, &result);
        int rcu_locked = 0;

        if (!newpage)
                return -ENOMEM;

        if (page_count(page) == 1)
                /* page was freed from under us. So we are done. */
                goto move_newpage;

        rc = -EAGAIN;
        if (TestSetPageLocked(page)) {
                if (!force)
                        goto move_newpage;
                lock_page(page);
        }

        if (PageWriteback(page)) {
                if (!force)
                        goto unlock;
                wait_on_page_writeback(page);
        }
        /*
         * Once try_to_unmap() drops page->mapcount to 0, we can no longer
         * be sure the anon_vma is not freed while we migrate the page.
         * This rcu_read_lock() delays freeing of the anon_vma pointer until
         * the end of migration. File cache pages are no problem because
         * they are protected by the page lock during migration, so only
         * anonymous pages need this care.
         */
        if (PageAnon(page)) {
                rcu_read_lock();
                rcu_locked = 1;
        }
        /*
         * This is a corner case handling.
         * When a new swap-cache page is read in, it is added to the LRU
         * and treated as swapcache but it has no rmap yet.
         * Calling try_to_unmap() against a page->mapping==NULL page is
         * a BUG. So handle it here.
         */
        if (!page->mapping)
                goto rcu_unlock;
        /* Establish migration ptes or remove ptes */
        try_to_unmap(page, 1);

        if (!page_mapped(page))
                rc = move_to_new_page(newpage, page);

        if (rc)
                remove_migration_ptes(page, page);
rcu_unlock:
        if (rcu_locked)
                rcu_read_unlock();

unlock:
        unlock_page(page);

        if (rc != -EAGAIN) {
                /*
                 * A page that has been migrated has all references
                 * removed and will be freed. A page that has not been
                 * migrated will have kept its references and be
                 * restored.
                 */
                list_del(&page->lru);
                move_to_lru(page);
        }

move_newpage:
        /*
         * Move the new page to the LRU. If migration was not successful
         * then this will free the page.
         */
        move_to_lru(newpage);
        if (result) {
                if (rc)
                        *result = rc;
                else
                        *result = page_to_nid(newpage);
        }
        return rc;
}
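/*
 * Shape of a minimal get_new_page callback, sketched after
 * new_node_page() in mm/mempolicy.c (hedged; here private carries the
 * target node number):
 *
 *	static struct page *new_node_page(struct page *page,
 *				unsigned long node, int **result)
 *	{
 *		return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0);
 *	}
 *
 * Returning NULL makes unmap_and_move() fail with -ENOMEM, which
 * aborts the whole migrate_pages() run below.
 */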
/*
 * migrate_pages
 *
 * The function takes one list of pages to migrate and a callback that,
 * given a page to be migrated and the private data, allocates the
 * target page of the move.
 *
 * The function returns after 10 attempts, or earlier when no pages are
 * movable anymore because the list has become empty or no retryable
 * pages remain. All pages will be returned to the LRU or freed.
 *
 * Return: Number of pages not migrated or error code.
 */
int migrate_pages(struct list_head *from,
                new_page_t get_new_page, unsigned long private)
{
        int retry = 1;
        int nr_failed = 0;
        int pass = 0;
        struct page *page;
        struct page *page2;
        int swapwrite = current->flags & PF_SWAPWRITE;
        int rc;

        if (!swapwrite)
                current->flags |= PF_SWAPWRITE;

        for (pass = 0; pass < 10 && retry; pass++) {
                retry = 0;

                list_for_each_entry_safe(page, page2, from, lru) {
                        cond_resched();

                        rc = unmap_and_move(get_new_page, private,
                                                page, pass > 2);

                        switch (rc) {
                        case -ENOMEM:
                                goto out;
                        case -EAGAIN:
                                retry++;
                                break;
                        case 0:
                                break;
                        default:
                                /* Permanent failure */
                                nr_failed++;
                                break;
                        }
                }
        }
        rc = 0;
out:
        if (!swapwrite)
                current->flags &= ~PF_SWAPWRITE;

        putback_lru_pages(from);

        if (rc)
                return rc;

        return nr_failed + retry;
}

#ifdef CONFIG_NUMA
/*
 * Move a list of individual pages
 */
struct page_to_node {
        unsigned long addr;
        struct page *page;
        int node;
        int status;
};

static struct page *new_page_node(struct page *p, unsigned long private,
                int **result)
{
        struct page_to_node *pm = (struct page_to_node *)private;

        while (pm->node != MAX_NUMNODES && pm->page != p)
                pm++;

        if (pm->node == MAX_NUMNODES)
                return NULL;

        *result = &pm->status;

        return alloc_pages_node(pm->node,
                                GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0);
}
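/*
 * Layout of the control array (illustrative): one slot per page, plus
 * an end marker whose node field is MAX_NUMNODES:
 *
 *	pm[0]		= { .addr = addr0, .node = target0 };
 *	...
 *	pm[nr_pages]	= { .node = MAX_NUMNODES };	(end marker)
 *
 * new_page_node() locates the slot for a given page by linear search,
 * so each allocation costs O(nr_pages).
 */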
/*
 * Move a set of pages as indicated in the pm array. The addr
 * field must be set to the virtual address of the page to be moved
 * and the node number must contain a valid target node.
 */
static int do_move_pages(struct mm_struct *mm, struct page_to_node *pm,
                                int migrate_all)
{
        int err;
        struct page_to_node *pp;
        LIST_HEAD(pagelist);

        down_read(&mm->mmap_sem);

        /*
         * Build a list of pages to migrate
         */
        migrate_prep();
        for (pp = pm; pp->node != MAX_NUMNODES; pp++) {
                struct vm_area_struct *vma;
                struct page *page;

                /*
                 * A valid page pointer that will not match any of the
                 * pages that will be moved.
                 */
                pp->page = ZERO_PAGE(0);

                err = -EFAULT;
                vma = find_vma(mm, pp->addr);
                if (!vma || !vma_migratable(vma))
                        goto set_status;

                page = follow_page(vma, pp->addr, FOLL_GET);
                err = -ENOENT;
                if (!page)
                        goto set_status;

                if (PageReserved(page))         /* Check for zero page */
                        goto put_and_set;

                pp->page = page;
                err = page_to_nid(page);

                if (err == pp->node)
                        /*
                         * Node already in the right place
                         */
                        goto put_and_set;

                err = -EACCES;
                if (page_mapcount(page) > 1 &&
                                !migrate_all)
                        goto put_and_set;

                err = isolate_lru_page(page, &pagelist);
put_and_set:
                /*
                 * Either remove the duplicate refcount from
                 * isolate_lru_page() or drop the page ref if it was
                 * not isolated.
                 */
                put_page(page);
set_status:
                pp->status = err;
        }

        if (!list_empty(&pagelist))
                err = migrate_pages(&pagelist, new_page_node,
                                (unsigned long)pm);
        else
                err = -ENOENT;

        up_read(&mm->mmap_sem);
        return err;
}

/*
 * Determine the nodes of a list of pages. The addr in the pm array
 * must have been set to the virtual address whose node number we want
 * to determine.
 */
static int do_pages_stat(struct mm_struct *mm, struct page_to_node *pm)
{
        down_read(&mm->mmap_sem);

        for ( ; pm->node != MAX_NUMNODES; pm++) {
                struct vm_area_struct *vma;
                struct page *page;
                int err;

                err = -EFAULT;
                vma = find_vma(mm, pm->addr);
                if (!vma)
                        goto set_status;

                page = follow_page(vma, pm->addr, 0);
                err = -ENOENT;
                /* Use PageReserved to check for zero page */
                if (!page || PageReserved(page))
                        goto set_status;

                err = page_to_nid(page);
set_status:
                pm->status = err;
        }

        up_read(&mm->mmap_sem);
        return 0;
}
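/*
 * The syscall below is normally reached through libnuma's wrapper
 * (sketch of a hypothetical userspace caller; buf must point into the
 * caller's address space):
 *
 *	#include <numaif.h>
 *
 *	void *pages[1] = { buf };
 *	int nodes[1] = { 1 };
 *	int status[1];
 *
 *	if (move_pages(0, 1, pages, nodes, status, MPOL_MF_MOVE))
 *		perror("move_pages");
 *
 * A pid of 0 targets the calling process; passing nodes == NULL turns
 * the call into a pure status query via do_pages_stat().
 */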
/*
 * Move a list of pages in the address space of the currently executing
 * process.
 */
asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages,
                        const void __user * __user *pages,
                        const int __user *nodes,
                        int __user *status, int flags)
{
        int err = 0;
        int i;
        struct task_struct *task;
        nodemask_t task_nodes;
        struct mm_struct *mm;
        struct page_to_node *pm = NULL;

        /* Check flags */
        if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
                return -EINVAL;

        if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
                return -EPERM;

        /* Find the mm_struct */
        read_lock(&tasklist_lock);
        task = pid ? find_task_by_vpid(pid) : current;
        if (!task) {
                read_unlock(&tasklist_lock);
                return -ESRCH;
        }
        mm = get_task_mm(task);
        read_unlock(&tasklist_lock);

        if (!mm)
                return -EINVAL;

        /*
         * Check if this process has the right to modify the specified
         * process. The right exists if the process has administrative
         * capabilities, superuser privileges or the same
         * userid as the target process.
         */
        if ((current->euid != task->suid) && (current->euid != task->uid) &&
            (current->uid != task->suid) && (current->uid != task->uid) &&
            !capable(CAP_SYS_NICE)) {
                err = -EPERM;
                goto out2;
        }

        err = security_task_movememory(task);
        if (err)
                goto out2;

        task_nodes = cpuset_mems_allowed(task);

        /* Limit nr_pages so that the multiplication may not overflow */
        if (nr_pages >= ULONG_MAX / sizeof(struct page_to_node) - 1) {
                err = -E2BIG;
                goto out2;
        }

        pm = vmalloc((nr_pages + 1) * sizeof(struct page_to_node));
        if (!pm) {
                err = -ENOMEM;
                goto out2;
        }

        /*
         * Get parameters from user space and initialize the pm
         * array. Return various errors if the user did something wrong.
         */
        for (i = 0; i < nr_pages; i++) {
                const void __user *p;

                err = -EFAULT;
                if (get_user(p, pages + i))
                        goto out;

                pm[i].addr = (unsigned long)p;
                if (nodes) {
                        int node;

                        if (get_user(node, nodes + i))
                                goto out;

                        err = -ENODEV;
                        if (!node_state(node, N_HIGH_MEMORY))
                                goto out;

                        err = -EACCES;
                        if (!node_isset(node, task_nodes))
                                goto out;

                        pm[i].node = node;
                } else
                        pm[i].node = 0; /* anything to not match MAX_NUMNODES */
        }
        /* End marker */
        pm[nr_pages].node = MAX_NUMNODES;

        if (nodes)
                err = do_move_pages(mm, pm, flags & MPOL_MF_MOVE_ALL);
        else
                err = do_pages_stat(mm, pm);

        if (err >= 0)
                /* Return status information */
                for (i = 0; i < nr_pages; i++)
                        if (put_user(pm[i].status, status + i))
                                err = -EFAULT;

out:
        vfree(pm);
out2:
        mmput(mm);
        return err;
}
#endif

/*
 * Call migration functions in the vma_ops that may prepare
 * memory in a vm for migration. Migration functions may perform
 * the migration for vmas that do not have an underlying page struct.
 */
int migrate_vmas(struct mm_struct *mm, const nodemask_t *to,
        const nodemask_t *from, unsigned long flags)
{
        struct vm_area_struct *vma;
        int err = 0;

        for (vma = mm->mmap; vma->vm_next && !err; vma = vma->vm_next) {
                if (vma->vm_ops && vma->vm_ops->migrate) {
                        err = vma->vm_ops->migrate(vma, to, from, flags);
                        if (err)
                                break;
                }
        }
        return err;
}