/* mm/rmap.c, revision 8a9f3ccd24741b50200c3f33d62534c7271f3dfc */
/*
 * mm/rmap.c - physical to virtual reverse mappings
 *
 * Copyright 2001, Rik van Riel <riel@conectiva.com.br>
 * Released under the General Public License (GPL).
 *
 * Simple, low overhead reverse mapping scheme.
 * Please try to keep this thing as modular as possible.
 *
 * Provides methods for unmapping each kind of mapped page:
 * the anon methods track anonymous pages, and
 * the file methods track pages belonging to an inode.
 *
 * Original design by Rik van Riel <riel@conectiva.com.br> 2001
 * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004
 * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004
 * Contributions by Hugh Dickins <hugh@veritas.com> 2003, 2004
 */

/*
 * Lock ordering in mm:
 *
 * inode->i_mutex	(while writing or truncating, not reading or faulting)
 *   inode->i_alloc_sem (vmtruncate_range)
 *   mm->mmap_sem
 *     page->flags PG_locked (lock_page)
 *       mapping->i_mmap_lock
 *         anon_vma->lock
 *           mm->page_table_lock or pte_lock
 *             zone->lru_lock (in mark_page_accessed, isolate_lru_page)
 *             swap_lock (in swap_duplicate, swap_info_get)
 *               mmlist_lock (in mmput, drain_mmlist and others)
 *               mapping->private_lock (in __set_page_dirty_buffers)
 *               inode_lock (in set_page_dirty's __mark_inode_dirty)
 *                 sb_lock (within inode_lock in fs/fs-writeback.c)
 *                 mapping->tree_lock (widely used, in set_page_dirty,
 *                           in arch-dependent flush_dcache_mmap_lock,
 *                           within inode_lock in __sync_single_inode)
 */

#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/rmap.h>
#include <linux/rcupdate.h>
#include <linux/module.h>
#include <linux/kallsyms.h>
#include <linux/memcontrol.h>

#include <asm/tlbflush.h>

struct kmem_cache *anon_vma_cachep;

/* This must be called under the mmap_sem. */
int anon_vma_prepare(struct vm_area_struct *vma)
{
	struct anon_vma *anon_vma = vma->anon_vma;

	might_sleep();
	if (unlikely(!anon_vma)) {
		struct mm_struct *mm = vma->vm_mm;
		struct anon_vma *allocated, *locked;

		anon_vma = find_mergeable_anon_vma(vma);
		if (anon_vma) {
			allocated = NULL;
			locked = anon_vma;
			spin_lock(&locked->lock);
		} else {
			anon_vma = anon_vma_alloc();
			if (unlikely(!anon_vma))
				return -ENOMEM;
			allocated = anon_vma;
			locked = NULL;
		}

		/* page_table_lock to protect against threads */
		spin_lock(&mm->page_table_lock);
		if (likely(!vma->anon_vma)) {
			vma->anon_vma = anon_vma;
			list_add_tail(&vma->anon_vma_node, &anon_vma->head);
			allocated = NULL;
		}
		spin_unlock(&mm->page_table_lock);

		if (locked)
			spin_unlock(&locked->lock);
		if (unlikely(allocated))
			anon_vma_free(allocated);
	}
	return 0;
}
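/*
 * Illustrative sketch (editorial, not part of the original file): the
 * expected caller pattern for anon_vma_prepare() is a fault handler
 * that guarantees vma->anon_vma exists before it maps a new anonymous
 * page.  Loosely modelled on do_anonymous_page() in mm/memory.c;
 * details there may differ:
 *
 *	if (unlikely(anon_vma_prepare(vma)))
 *		return VM_FAULT_OOM;
 *	page = alloc_zeroed_user_highpage_movable(vma, address);
 *	...
 *	page_add_new_anon_rmap(page, vma, address);
 */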
void __anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next)
{
	BUG_ON(vma->anon_vma != next->anon_vma);
	list_del(&next->anon_vma_node);
}

void __anon_vma_link(struct vm_area_struct *vma)
{
	struct anon_vma *anon_vma = vma->anon_vma;

	if (anon_vma)
		list_add_tail(&vma->anon_vma_node, &anon_vma->head);
}

void anon_vma_link(struct vm_area_struct *vma)
{
	struct anon_vma *anon_vma = vma->anon_vma;

	if (anon_vma) {
		spin_lock(&anon_vma->lock);
		list_add_tail(&vma->anon_vma_node, &anon_vma->head);
		spin_unlock(&anon_vma->lock);
	}
}

void anon_vma_unlink(struct vm_area_struct *vma)
{
	struct anon_vma *anon_vma = vma->anon_vma;
	int empty;

	if (!anon_vma)
		return;

	spin_lock(&anon_vma->lock);
	list_del(&vma->anon_vma_node);

	/* We must garbage collect the anon_vma if it's empty */
	empty = list_empty(&anon_vma->head);
	spin_unlock(&anon_vma->lock);

	if (empty)
		anon_vma_free(anon_vma);
}

static void anon_vma_ctor(struct kmem_cache *cachep, void *data)
{
	struct anon_vma *anon_vma = data;

	spin_lock_init(&anon_vma->lock);
	INIT_LIST_HEAD(&anon_vma->head);
}

void __init anon_vma_init(void)
{
	anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
			0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor);
}

/*
 * Getting a lock on a stable anon_vma from a page off the LRU is
 * tricky: page_lock_anon_vma relies on RCU to guard against the races.
 */
static struct anon_vma *page_lock_anon_vma(struct page *page)
{
	struct anon_vma *anon_vma;
	unsigned long anon_mapping;

	rcu_read_lock();
	anon_mapping = (unsigned long) page->mapping;
	if (!(anon_mapping & PAGE_MAPPING_ANON))
		goto out;
	if (!page_mapped(page))
		goto out;

	anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
	spin_lock(&anon_vma->lock);
	return anon_vma;
out:
	rcu_read_unlock();
	return NULL;
}

static void page_unlock_anon_vma(struct anon_vma *anon_vma)
{
	spin_unlock(&anon_vma->lock);
	rcu_read_unlock();
}

/*
 * At what user virtual address is page expected in @vma?
 * Returns virtual address or -EFAULT if page's index/offset is not
 * within the range mapped by @vma.
 */
static inline unsigned long
vma_address(struct page *page, struct vm_area_struct *vma)
{
	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
	unsigned long address;

	address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
	if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
		/* page should be within @vma mapping range */
		return -EFAULT;
	}
	return address;
}
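/*
 * Worked example (editorial sketch, values hypothetical): with 4K pages
 * and PAGE_CACHE_SIZE == PAGE_SIZE, a vma with vm_start = 0x08048000
 * mapping file offset 0x10000 (vm_pgoff = 0x10) expects a page with
 * page->index = 0x13 at
 *
 *	address = 0x08048000 + ((0x13 - 0x10) << 12) = 0x0804b000
 *
 * A page->index below vm_pgoff, or beyond the end of the vma, makes the
 * computed address fall outside [vm_start, vm_end) and yields -EFAULT.
 */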
/*
 * At what user virtual address is page expected in vma? Checking that the
 * page matches the vma: currently only used on anon pages, by unuse_vma.
 */
unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
{
	if (PageAnon(page)) {
		if ((void *)vma->anon_vma !=
		    (void *)page->mapping - PAGE_MAPPING_ANON)
			return -EFAULT;
	} else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) {
		if (!vma->vm_file ||
		    vma->vm_file->f_mapping != page->mapping)
			return -EFAULT;
	} else
		return -EFAULT;
	return vma_address(page, vma);
}

/*
 * Check that @page is mapped at @address into @mm.
 *
 * On success returns with pte mapped and locked.
 */
pte_t *page_check_address(struct page *page, struct mm_struct *mm,
			  unsigned long address, spinlock_t **ptlp)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;
	spinlock_t *ptl;

	pgd = pgd_offset(mm, address);
	if (!pgd_present(*pgd))
		return NULL;

	pud = pud_offset(pgd, address);
	if (!pud_present(*pud))
		return NULL;

	pmd = pmd_offset(pud, address);
	if (!pmd_present(*pmd))
		return NULL;

	pte = pte_offset_map(pmd, address);
	/* Make a quick check before getting the lock */
	if (!pte_present(*pte)) {
		pte_unmap(pte);
		return NULL;
	}

	ptl = pte_lockptr(mm, pmd);
	spin_lock(ptl);
	if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) {
		*ptlp = ptl;
		return pte;
	}
	pte_unmap_unlock(pte, ptl);
	return NULL;
}
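/*
 * Illustrative usage sketch (editorial): every caller follows the same
 * pattern - on success the pte comes back mapped and locked, and must
 * be released with pte_unmap_unlock():
 *
 *	spinlock_t *ptl;
 *	pte_t *pte = page_check_address(page, mm, address, &ptl);
 *	if (pte) {
 *		... inspect or modify *pte ...
 *		pte_unmap_unlock(pte, ptl);
 *	}
 *
 * page_referenced_one() below is a real instance of this pattern.
 */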
/*
 * Subfunctions of page_referenced: page_referenced_one called
 * repeatedly from either page_referenced_anon or page_referenced_file.
 */
static int page_referenced_one(struct page *page,
	struct vm_area_struct *vma, unsigned int *mapcount)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long address;
	pte_t *pte;
	spinlock_t *ptl;
	int referenced = 0;

	address = vma_address(page, vma);
	if (address == -EFAULT)
		goto out;

	pte = page_check_address(page, mm, address, &ptl);
	if (!pte)
		goto out;

	if (vma->vm_flags & VM_LOCKED) {
		referenced++;
		*mapcount = 1;	/* break early from loop */
	} else if (ptep_clear_flush_young(vma, address, pte))
		referenced++;

	/*
	 * Pretend the page is referenced if the task has the
	 * swap token and is in the middle of a page fault.
	 */
	if (mm != current->mm && has_swap_token(mm) &&
			rwsem_is_locked(&mm->mmap_sem))
		referenced++;

	(*mapcount)--;
	pte_unmap_unlock(pte, ptl);
out:
	return referenced;
}

static int page_referenced_anon(struct page *page)
{
	unsigned int mapcount;
	struct anon_vma *anon_vma;
	struct vm_area_struct *vma;
	int referenced = 0;

	anon_vma = page_lock_anon_vma(page);
	if (!anon_vma)
		return referenced;

	mapcount = page_mapcount(page);
	list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
		referenced += page_referenced_one(page, vma, &mapcount);
		if (!mapcount)
			break;
	}

	page_unlock_anon_vma(anon_vma);
	return referenced;
}

/**
 * page_referenced_file - referenced check for object-based rmap
 * @page: the page we're checking references on.
 *
 * For an object-based mapped page, find all the places it is mapped and
 * check/clear the referenced flag. This is done by following the page->mapping
 * pointer, then walking the chain of vmas it holds. It returns the number
 * of references it found.
 *
 * This function is only called from page_referenced for object-based pages.
 */
static int page_referenced_file(struct page *page)
{
	unsigned int mapcount;
	struct address_space *mapping = page->mapping;
	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
	struct vm_area_struct *vma;
	struct prio_tree_iter iter;
	int referenced = 0;

	/*
	 * The caller's checks on page->mapping and !PageAnon have made
	 * sure that this is a file page: the check for page->mapping
	 * excludes the case just before it gets set on an anon page.
	 */
	BUG_ON(PageAnon(page));

	/*
	 * The page lock not only makes sure that page->mapping cannot
	 * suddenly be NULLified by truncation, it makes sure that the
	 * structure at mapping cannot be freed and reused yet,
	 * so we can safely take mapping->i_mmap_lock.
	 */
	BUG_ON(!PageLocked(page));

	spin_lock(&mapping->i_mmap_lock);

	/*
	 * i_mmap_lock does not stabilize mapcount at all, but mapcount
	 * is more likely to be accurate if we note it after spinning.
	 */
	mapcount = page_mapcount(page);

	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
		if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE))
				  == (VM_LOCKED|VM_MAYSHARE)) {
			referenced++;
			break;
		}
		referenced += page_referenced_one(page, vma, &mapcount);
		if (!mapcount)
			break;
	}

	spin_unlock(&mapping->i_mmap_lock);
	return referenced;
}

/**
 * page_referenced - test if the page was referenced
 * @page: the page to test
 * @is_locked: caller holds lock on the page
 *
 * Quick test_and_clear_referenced for all mappings to a page,
 * returns the number of ptes which referenced the page.
 */
int page_referenced(struct page *page, int is_locked)
{
	int referenced = 0;

	if (page_test_and_clear_young(page))
		referenced++;

	if (TestClearPageReferenced(page))
		referenced++;

	if (page_mapped(page) && page->mapping) {
		if (PageAnon(page))
			referenced += page_referenced_anon(page);
		else if (is_locked)
			referenced += page_referenced_file(page);
		else if (TestSetPageLocked(page))
			referenced++;
		else {
			if (page->mapping)
				referenced += page_referenced_file(page);
			unlock_page(page);
		}
	}
	return referenced;
}
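/*
 * Editorial sketch of the consumer side (loosely modelled on the page
 * reclaim code in mm/vmscan.c; details there differ): reclaim uses the
 * returned count to decide whether a page is still hot:
 *
 *	if (page_referenced(page, 1))
 *		... keep the page on the active list ...
 *	else
 *		... candidate for deactivation and pageout ...
 */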
static int page_mkclean_one(struct page *page, struct vm_area_struct *vma)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long address;
	pte_t *pte;
	spinlock_t *ptl;
	int ret = 0;

	address = vma_address(page, vma);
	if (address == -EFAULT)
		goto out;

	pte = page_check_address(page, mm, address, &ptl);
	if (!pte)
		goto out;

	if (pte_dirty(*pte) || pte_write(*pte)) {
		pte_t entry;

		flush_cache_page(vma, address, pte_pfn(*pte));
		entry = ptep_clear_flush(vma, address, pte);
		entry = pte_wrprotect(entry);
		entry = pte_mkclean(entry);
		set_pte_at(mm, address, pte, entry);
		ret = 1;
	}

	pte_unmap_unlock(pte, ptl);
out:
	return ret;
}

static int page_mkclean_file(struct address_space *mapping, struct page *page)
{
	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
	struct vm_area_struct *vma;
	struct prio_tree_iter iter;
	int ret = 0;

	BUG_ON(PageAnon(page));

	spin_lock(&mapping->i_mmap_lock);
	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
		if (vma->vm_flags & VM_SHARED)
			ret += page_mkclean_one(page, vma);
	}
	spin_unlock(&mapping->i_mmap_lock);
	return ret;
}

int page_mkclean(struct page *page)
{
	int ret = 0;

	BUG_ON(!PageLocked(page));

	if (page_mapped(page)) {
		struct address_space *mapping = page_mapping(page);
		if (mapping) {
			ret = page_mkclean_file(mapping, page);
			if (page_test_dirty(page)) {
				page_clear_dirty(page);
				ret = 1;
			}
		}
	}

	return ret;
}
EXPORT_SYMBOL_GPL(page_mkclean);
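/*
 * Editorial sketch of the caller side (loosely modelled on
 * clear_page_dirty_for_io() in mm/page-writeback.c; details there
 * differ): before writeback, the ptes are cleaned and write-protected
 * so that any later store faults and redirties the page:
 *
 *	if (page_mkclean(page))
 *		set_page_dirty(page);
 *
 * After this, no pte can dirty the page behind writeback's back.
 */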
/**
 * __page_set_anon_rmap - setup new anonymous rmap
 * @page: the page to add the mapping to
 * @vma: the vm area in which the mapping is added
 * @address: the user virtual address mapped
 */
static void __page_set_anon_rmap(struct page *page,
	struct vm_area_struct *vma, unsigned long address)
{
	struct anon_vma *anon_vma = vma->anon_vma;

	BUG_ON(!anon_vma);
	anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
	page->mapping = (struct address_space *) anon_vma;

	page->index = linear_page_index(vma, address);

	/*
	 * nr_mapped state can be updated without turning off
	 * interrupts because it is not modified via interrupt.
	 */
	__inc_zone_page_state(page, NR_ANON_PAGES);
}

/**
 * __page_check_anon_rmap - sanity check anonymous rmap addition
 * @page: the page to add the mapping to
 * @vma: the vm area in which the mapping is added
 * @address: the user virtual address mapped
 */
static void __page_check_anon_rmap(struct page *page,
	struct vm_area_struct *vma, unsigned long address)
{
#ifdef CONFIG_DEBUG_VM
	/*
	 * The page's anon-rmap details (mapping and index) are guaranteed to
	 * be set up correctly at this point.
	 *
	 * We have exclusion against page_add_anon_rmap because the caller
	 * always holds the page locked, except if called from page_dup_rmap,
	 * in which case the page is already known to be setup.
	 *
	 * We have exclusion against page_add_new_anon_rmap because those pages
	 * are initially only visible via the pagetables, and the pte is locked
	 * over the call to page_add_new_anon_rmap.
	 */
	struct anon_vma *anon_vma = vma->anon_vma;
	anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
	BUG_ON(page->mapping != (struct address_space *)anon_vma);
	BUG_ON(page->index != linear_page_index(vma, address));
#endif
}

/**
 * page_add_anon_rmap - add pte mapping to an anonymous page
 * @page: the page to add the mapping to
 * @vma: the vm area in which the mapping is added
 * @address: the user virtual address mapped
 *
 * The caller needs to hold the pte lock and the page must be locked.
 */
void page_add_anon_rmap(struct page *page,
	struct vm_area_struct *vma, unsigned long address)
{
	VM_BUG_ON(!PageLocked(page));
	VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
	if (atomic_inc_and_test(&page->_mapcount))
		__page_set_anon_rmap(page, vma, address);
	else {
		__page_check_anon_rmap(page, vma, address);
		/*
		 * We unconditionally charged during prepare, we uncharge here.
		 * This takes care of balancing the reference counts.
		 */
		mem_cgroup_uncharge_page(page);
	}
}

/**
 * page_add_new_anon_rmap - add pte mapping to a new anonymous page
 * @page: the page to add the mapping to
 * @vma: the vm area in which the mapping is added
 * @address: the user virtual address mapped
 *
 * Same as page_add_anon_rmap but must only be called on *new* pages.
 * This means the inc-and-test can be bypassed.
 * Page does not have to be locked.
 */
void page_add_new_anon_rmap(struct page *page,
	struct vm_area_struct *vma, unsigned long address)
{
	BUG_ON(address < vma->vm_start || address >= vma->vm_end);
	atomic_set(&page->_mapcount, 0); /* elevate count by 1 (starts at -1) */
	__page_set_anon_rmap(page, vma, address);
}

/**
 * page_add_file_rmap - add pte mapping to a file page
 * @page: the page to add the mapping to
 *
 * The caller needs to hold the pte lock.
 */
void page_add_file_rmap(struct page *page)
{
	if (atomic_inc_and_test(&page->_mapcount))
		__inc_zone_page_state(page, NR_FILE_MAPPED);
	else
		/*
		 * We unconditionally charged during prepare, we uncharge here.
		 * This takes care of balancing the reference counts.
		 */
		mem_cgroup_uncharge_page(page);
}

#ifdef CONFIG_DEBUG_VM
/**
 * page_dup_rmap - duplicate pte mapping to a page
 * @page: the page to add the mapping to
 * @vma: the vm area being duplicated
 * @address: the user virtual address mapped
 *
 * For copy_page_range only: minimal extract from page_add_file_rmap /
 * page_add_anon_rmap, avoiding unnecessary tests (already checked) so it's
 * quicker.
 *
 * The caller needs to hold the pte lock.
 */
void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address)
{
	BUG_ON(page_mapcount(page) == 0);
	if (PageAnon(page))
		__page_check_anon_rmap(page, vma, address);
	atomic_inc(&page->_mapcount);
}
#endif
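/*
 * Editorial illustration of the page->mapping encoding used above: for
 * anonymous pages the low bit of the pointer is set, so one word serves
 * both kinds of page.  Conceptually:
 *
 *	mapping  = (struct address_space *)
 *			((void *)anon_vma + PAGE_MAPPING_ANON);
 *	PageAnon(page) tests (unsigned long)page->mapping & PAGE_MAPPING_ANON;
 *	anon_vma = (void *)page->mapping - PAGE_MAPPING_ANON;
 *
 * This works because anon_vma objects are at least word-aligned,
 * leaving the bottom bit free as a type tag.
 */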
/**
 * page_remove_rmap - take down pte mapping from a page
 * @page: page to remove mapping from
 * @vma: the vm area from which the mapping is removed
 *
 * The caller needs to hold the pte lock.
 */
void page_remove_rmap(struct page *page, struct vm_area_struct *vma)
{
	if (atomic_add_negative(-1, &page->_mapcount)) {
		if (unlikely(page_mapcount(page) < 0)) {
			printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page));
			printk (KERN_EMERG "  page pfn = %lx\n", page_to_pfn(page));
			printk (KERN_EMERG "  page->flags = %lx\n", page->flags);
			printk (KERN_EMERG "  page->count = %x\n", page_count(page));
			printk (KERN_EMERG "  page->mapping = %p\n", page->mapping);
			print_symbol (KERN_EMERG "  vma->vm_ops = %s\n", (unsigned long)vma->vm_ops);
			if (vma->vm_ops) {
				print_symbol (KERN_EMERG "  vma->vm_ops->nopage = %s\n", (unsigned long)vma->vm_ops->nopage);
				print_symbol (KERN_EMERG "  vma->vm_ops->fault = %s\n", (unsigned long)vma->vm_ops->fault);
			}
			if (vma->vm_file && vma->vm_file->f_op)
				print_symbol (KERN_EMERG "  vma->vm_file->f_op->mmap = %s\n", (unsigned long)vma->vm_file->f_op->mmap);
			BUG();
		}

		/*
		 * It would be tidy to reset the PageAnon mapping here,
		 * but that might overwrite a racing page_add_anon_rmap
		 * which increments mapcount after us but sets mapping
		 * before us: so leave the reset to free_hot_cold_page,
		 * and remember that it's only reliable while mapped.
		 * Leaving it set also helps swapoff to reinstate ptes
		 * faster for those pages still in swapcache.
		 */
		if (page_test_dirty(page)) {
			page_clear_dirty(page);
			set_page_dirty(page);
		}
		mem_cgroup_uncharge_page(page);

		__dec_zone_page_state(page,
			PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED);
	}
}
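/*
 * Editorial note on the _mapcount arithmetic above: _mapcount starts at
 * -1 for an unmapped page, so the two hot-path primitives detect the
 * interesting transitions for free:
 *
 *	atomic_inc_and_test()		-1 -> 0		first mapping added
 *	atomic_add_negative(-1, ...)	 0 -> -1	last mapping removed
 *
 * page_mapcount() returns _mapcount + 1, which is why a negative value
 * there means the accounting has gone wrong.
 */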
/*
 * Subfunctions of try_to_unmap: try_to_unmap_one called
 * repeatedly from either try_to_unmap_anon or try_to_unmap_file.
 */
static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
				int migration)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long address;
	pte_t *pte;
	pte_t pteval;
	spinlock_t *ptl;
	int ret = SWAP_AGAIN;

	address = vma_address(page, vma);
	if (address == -EFAULT)
		goto out;

	pte = page_check_address(page, mm, address, &ptl);
	if (!pte)
		goto out;

	/*
	 * If the page is mlock()d, we cannot swap it out.
	 * If it's recently referenced (perhaps page_referenced
	 * skipped over this mm) then we should reactivate it.
	 */
	if (!migration && ((vma->vm_flags & VM_LOCKED) ||
			(ptep_clear_flush_young(vma, address, pte)))) {
		ret = SWAP_FAIL;
		goto out_unmap;
	}

	/* Nuke the page table entry. */
	flush_cache_page(vma, address, page_to_pfn(page));
	pteval = ptep_clear_flush(vma, address, pte);

	/* Move the dirty bit to the physical page now the pte is gone. */
	if (pte_dirty(pteval))
		set_page_dirty(page);

	/* Update high watermark before we lower rss */
	update_hiwater_rss(mm);

	if (PageAnon(page)) {
		swp_entry_t entry = { .val = page_private(page) };

		if (PageSwapCache(page)) {
			/*
			 * Store the swap location in the pte.
			 * See handle_pte_fault() ...
			 */
			swap_duplicate(entry);
			if (list_empty(&mm->mmlist)) {
				spin_lock(&mmlist_lock);
				if (list_empty(&mm->mmlist))
					list_add(&mm->mmlist, &init_mm.mmlist);
				spin_unlock(&mmlist_lock);
			}
			dec_mm_counter(mm, anon_rss);
#ifdef CONFIG_MIGRATION
		} else {
			/*
			 * Store the pfn of the page in a special migration
			 * pte. do_swap_page() will wait until the migration
			 * pte is removed and then restart fault handling.
			 */
			BUG_ON(!migration);
			entry = make_migration_entry(page, pte_write(pteval));
#endif
		}
		set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
		BUG_ON(pte_file(*pte));
	} else
#ifdef CONFIG_MIGRATION
	if (migration) {
		/* Establish migration entry for a file page */
		swp_entry_t entry;
		entry = make_migration_entry(page, pte_write(pteval));
		set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
	} else
#endif
		dec_mm_counter(mm, file_rss);

	page_remove_rmap(page, vma);
	page_cache_release(page);

out_unmap:
	pte_unmap_unlock(pte, ptl);
out:
	return ret;
}

/*
 * objrmap doesn't work for nonlinear VMAs because the assumption that
 * offset-into-file correlates with offset-into-virtual-addresses does not hold.
 * Consequently, given a particular page and its ->index, we cannot locate the
 * ptes which are mapping that page without an exhaustive linear search.
 *
 * So what this code does is a mini "virtual scan" of each nonlinear VMA which
 * maps the file to which the target page belongs. The ->vm_private_data field
 * holds the current cursor into that scan. Successive searches will circulate
 * around the vma's virtual address space.
 *
 * So as more replacement pressure is applied to the pages in a nonlinear VMA,
 * more scanning pressure is placed against them as well. Eventually pages
 * will become fully unmapped and become eligible for eviction.
 *
 * For very sparsely populated VMAs this is a little inefficient - chances are
 * there won't be many ptes located within the scan cluster. In this case
 * maybe we could scan further - to the end of the pte page, perhaps.
 */
#define CLUSTER_SIZE	min(32*PAGE_SIZE, PMD_SIZE)
#define CLUSTER_MASK	(~(CLUSTER_SIZE - 1))
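/*
 * Worked example (editorial): with 4K pages, CLUSTER_SIZE is
 * min(32 * 4K, PMD_SIZE) = 128K, i.e. 32 ptes per scan step, and
 * CLUSTER_MASK rounds a cursor down to a 128K boundary, so each call
 * to try_to_unmap_cluster() below visits one aligned window of the vma.
 */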
static void try_to_unmap_cluster(unsigned long cursor,
	unsigned int *mapcount, struct vm_area_struct *vma)
{
	struct mm_struct *mm = vma->vm_mm;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;
	pte_t pteval;
	spinlock_t *ptl;
	struct page *page;
	unsigned long address;
	unsigned long end;

	address = (vma->vm_start + cursor) & CLUSTER_MASK;
	end = address + CLUSTER_SIZE;
	if (address < vma->vm_start)
		address = vma->vm_start;
	if (end > vma->vm_end)
		end = vma->vm_end;

	pgd = pgd_offset(mm, address);
	if (!pgd_present(*pgd))
		return;

	pud = pud_offset(pgd, address);
	if (!pud_present(*pud))
		return;

	pmd = pmd_offset(pud, address);
	if (!pmd_present(*pmd))
		return;

	pte = pte_offset_map_lock(mm, pmd, address, &ptl);

	/* Update high watermark before we lower rss */
	update_hiwater_rss(mm);

	for (; address < end; pte++, address += PAGE_SIZE) {
		if (!pte_present(*pte))
			continue;
		page = vm_normal_page(vma, address, *pte);
		BUG_ON(!page || PageAnon(page));

		if (ptep_clear_flush_young(vma, address, pte))
			continue;

		/* Nuke the page table entry. */
		flush_cache_page(vma, address, pte_pfn(*pte));
		pteval = ptep_clear_flush(vma, address, pte);

		/* If nonlinear, store the file page offset in the pte. */
		if (page->index != linear_page_index(vma, address))
			set_pte_at(mm, address, pte, pgoff_to_pte(page->index));

		/* Move the dirty bit to the physical page now the pte is gone. */
		if (pte_dirty(pteval))
			set_page_dirty(page);

		page_remove_rmap(page, vma);
		page_cache_release(page);
		dec_mm_counter(mm, file_rss);
		(*mapcount)--;
	}
	pte_unmap_unlock(pte - 1, ptl);
}
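/*
 * Editorial note: the pgoff_to_pte() above is what keeps a nonlinear
 * mapping intact across unmap.  The file offset is encoded in the now
 * not-present pte, and the nonlinear fault path decodes it again with
 * pte_to_pgoff() to read the right page back in at the same address,
 * even though page->index does not match linear_page_index().
 */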
static int try_to_unmap_anon(struct page *page, int migration)
{
	struct anon_vma *anon_vma;
	struct vm_area_struct *vma;
	int ret = SWAP_AGAIN;

	anon_vma = page_lock_anon_vma(page);
	if (!anon_vma)
		return ret;

	list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
		ret = try_to_unmap_one(page, vma, migration);
		if (ret == SWAP_FAIL || !page_mapped(page))
			break;
	}

	page_unlock_anon_vma(anon_vma);
	return ret;
}

/**
 * try_to_unmap_file - unmap file page using the object-based rmap method
 * @page: the page to unmap
 * @migration: migration flag
 *
 * Find all the mappings of a page using the mapping pointer and the vma chains
 * contained in the address_space struct it points to.
 *
 * This function is only called from try_to_unmap for object-based pages.
 */
static int try_to_unmap_file(struct page *page, int migration)
{
	struct address_space *mapping = page->mapping;
	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
	struct vm_area_struct *vma;
	struct prio_tree_iter iter;
	int ret = SWAP_AGAIN;
	unsigned long cursor;
	unsigned long max_nl_cursor = 0;
	unsigned long max_nl_size = 0;
	unsigned int mapcount;

	spin_lock(&mapping->i_mmap_lock);
	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
		ret = try_to_unmap_one(page, vma, migration);
		if (ret == SWAP_FAIL || !page_mapped(page))
			goto out;
	}

	if (list_empty(&mapping->i_mmap_nonlinear))
		goto out;

	list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
						shared.vm_set.list) {
		if ((vma->vm_flags & VM_LOCKED) && !migration)
			continue;
		cursor = (unsigned long) vma->vm_private_data;
		if (cursor > max_nl_cursor)
			max_nl_cursor = cursor;
		cursor = vma->vm_end - vma->vm_start;
		if (cursor > max_nl_size)
			max_nl_size = cursor;
	}

	if (max_nl_size == 0) {	/* all nonlinears locked or reserved? */
		ret = SWAP_FAIL;
		goto out;
	}

	/*
	 * We don't try to search for this page in the nonlinear vmas,
	 * and page_referenced wouldn't have found it anyway. Instead
	 * just walk the nonlinear vmas trying to age and unmap some.
	 * The mapcount of the page we came in with is irrelevant,
	 * but even so use it as a guide to how hard we should try?
	 */
	mapcount = page_mapcount(page);
	if (!mapcount)
		goto out;
	cond_resched_lock(&mapping->i_mmap_lock);

	max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK;
	if (max_nl_cursor == 0)
		max_nl_cursor = CLUSTER_SIZE;

	do {
		list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
						shared.vm_set.list) {
			if ((vma->vm_flags & VM_LOCKED) && !migration)
				continue;
			cursor = (unsigned long) vma->vm_private_data;
			while (cursor < max_nl_cursor &&
				cursor < vma->vm_end - vma->vm_start) {
				try_to_unmap_cluster(cursor, &mapcount, vma);
				cursor += CLUSTER_SIZE;
				vma->vm_private_data = (void *) cursor;
				if ((int)mapcount <= 0)
					goto out;
			}
			vma->vm_private_data = (void *) max_nl_cursor;
		}
		cond_resched_lock(&mapping->i_mmap_lock);
		max_nl_cursor += CLUSTER_SIZE;
	} while (max_nl_cursor <= max_nl_size);

	/*
	 * Don't loop forever (perhaps all the remaining pages are
	 * in locked vmas). Reset cursor on all unreserved nonlinear
	 * vmas, now forgetting on which ones it had fallen behind.
	 */
	list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
		vma->vm_private_data = NULL;
out:
	spin_unlock(&mapping->i_mmap_lock);
	return ret;
}

/**
 * try_to_unmap - try to remove all page table mappings to a page
 * @page: the page to get unmapped
 * @migration: migration flag
 *
 * Tries to remove all the page table entries which are mapping this
 * page, used in the pageout path. Caller must hold the page lock.
 * Return values are:
 *
 * SWAP_SUCCESS	- we succeeded in removing all mappings
 * SWAP_AGAIN	- we missed a mapping, try again later
 * SWAP_FAIL	- the page is unswappable
 */
int try_to_unmap(struct page *page, int migration)
{
	int ret;

	BUG_ON(!PageLocked(page));

	if (PageAnon(page))
		ret = try_to_unmap_anon(page, migration);
	else
		ret = try_to_unmap_file(page, migration);

	if (!page_mapped(page))
		ret = SWAP_SUCCESS;
	return ret;
}
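/*
 * Editorial sketch of the pageout-path caller (loosely modelled on
 * shrink_page_list() in mm/vmscan.c; details there differ):
 *
 *	if (page_mapped(page) && mapping) {
 *		switch (try_to_unmap(page, 0)) {
 *		case SWAP_FAIL:
 *			goto activate_locked;
 *		case SWAP_AGAIN:
 *			goto keep_locked;
 *		case SWAP_SUCCESS:
 *			break;
 *		}
 *	}
 *
 * On SWAP_SUCCESS the page is fully unmapped and can then be written
 * out or freed.
 */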