rmap.c revision d6e88e671ac12888df2d533dd4ddef705431a32a
/*
 * mm/rmap.c - physical to virtual reverse mappings
 *
 * Copyright 2001, Rik van Riel <riel@conectiva.com.br>
 * Released under the General Public License (GPL).
 *
 * Simple, low overhead reverse mapping scheme.
 * Please try to keep this thing as modular as possible.
 *
 * Provides methods for unmapping each kind of mapped page:
 * the anon methods track anonymous pages, and
 * the file methods track pages belonging to an inode.
 *
 * Original design by Rik van Riel <riel@conectiva.com.br> 2001
 * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004
 * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004
 * Contributions by Hugh Dickins <hugh@veritas.com> 2003, 2004
 */

/*
 * Lock ordering in mm:
 *
 * inode->i_mutex	(while writing or truncating, not reading or faulting)
 *   inode->i_alloc_sem (vmtruncate_range)
 *   mm->mmap_sem
 *     page->flags PG_locked (lock_page)
 *       mapping->i_mmap_lock
 *         anon_vma->lock
 *           mm->page_table_lock or pte_lock
 *             zone->lru_lock (in mark_page_accessed, isolate_lru_page)
 *             swap_lock (in swap_duplicate, swap_info_get)
 *               mmlist_lock (in mmput, drain_mmlist and others)
 *               mapping->private_lock (in __set_page_dirty_buffers)
 *               inode_lock (in set_page_dirty's __mark_inode_dirty)
 *                 sb_lock (within inode_lock in fs/fs-writeback.c)
 *                 mapping->tree_lock (widely used, in set_page_dirty,
 *                           in arch-dependent flush_dcache_mmap_lock,
 *                           within inode_lock in __sync_single_inode)
 */

#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/rmap.h>
#include <linux/rcupdate.h>
#include <linux/module.h>
#include <linux/kallsyms.h>

#include <asm/tlbflush.h>

struct kmem_cache *anon_vma_cachep;

static inline void validate_anon_vma(struct vm_area_struct *find_vma)
{
#ifdef CONFIG_DEBUG_VM
	struct anon_vma *anon_vma = find_vma->anon_vma;
	struct vm_area_struct *vma;
	unsigned int mapcount = 0;
	int found = 0;

	list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
		mapcount++;
		BUG_ON(mapcount > 100000);
		if (vma == find_vma)
			found = 1;
	}
	BUG_ON(!found);
#endif
}
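/*
 * Note (summary of the scheme implemented below): an anon_vma heads a
 * list of vmas (linked through vma->anon_vma_node) that may contain
 * ptes for the anonymous pages associated with it.  Each such page
 * stores the anon_vma pointer in page->mapping, with PAGE_MAPPING_ANON
 * set in the low bit (see __page_set_anon_rmap), and page->index holds
 * the linear page index, so vma_address() can recompute the likely
 * user virtual address in every vma on the list.
 */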
/* This must be called under the mmap_sem. */
int anon_vma_prepare(struct vm_area_struct *vma)
{
	struct anon_vma *anon_vma = vma->anon_vma;

	might_sleep();
	if (unlikely(!anon_vma)) {
		struct mm_struct *mm = vma->vm_mm;
		struct anon_vma *allocated, *locked;

		anon_vma = find_mergeable_anon_vma(vma);
		if (anon_vma) {
			allocated = NULL;
			locked = anon_vma;
			spin_lock(&locked->lock);
		} else {
			anon_vma = anon_vma_alloc();
			if (unlikely(!anon_vma))
				return -ENOMEM;
			allocated = anon_vma;
			locked = NULL;
		}

		/* page_table_lock to protect against threads */
		spin_lock(&mm->page_table_lock);
		if (likely(!vma->anon_vma)) {
			vma->anon_vma = anon_vma;
			list_add_tail(&vma->anon_vma_node, &anon_vma->head);
			allocated = NULL;
		}
		spin_unlock(&mm->page_table_lock);

		if (locked)
			spin_unlock(&locked->lock);
		if (unlikely(allocated))
			anon_vma_free(allocated);
	}
	return 0;
}

void __anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next)
{
	BUG_ON(vma->anon_vma != next->anon_vma);
	list_del(&next->anon_vma_node);
}

void __anon_vma_link(struct vm_area_struct *vma)
{
	struct anon_vma *anon_vma = vma->anon_vma;

	if (anon_vma) {
		list_add_tail(&vma->anon_vma_node, &anon_vma->head);
		validate_anon_vma(vma);
	}
}

void anon_vma_link(struct vm_area_struct *vma)
{
	struct anon_vma *anon_vma = vma->anon_vma;

	if (anon_vma) {
		spin_lock(&anon_vma->lock);
		list_add_tail(&vma->anon_vma_node, &anon_vma->head);
		validate_anon_vma(vma);
		spin_unlock(&anon_vma->lock);
	}
}

void anon_vma_unlink(struct vm_area_struct *vma)
{
	struct anon_vma *anon_vma = vma->anon_vma;
	int empty;

	if (!anon_vma)
		return;

	spin_lock(&anon_vma->lock);
	validate_anon_vma(vma);
	list_del(&vma->anon_vma_node);

	/* We must garbage collect the anon_vma if it's empty */
	empty = list_empty(&anon_vma->head);
	spin_unlock(&anon_vma->lock);

	if (empty)
		anon_vma_free(anon_vma);
}

static void anon_vma_ctor(void *data, struct kmem_cache *cachep,
			  unsigned long flags)
{
	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
	    SLAB_CTOR_CONSTRUCTOR) {
		struct anon_vma *anon_vma = data;

		spin_lock_init(&anon_vma->lock);
		INIT_LIST_HEAD(&anon_vma->head);
	}
}

void __init anon_vma_init(void)
{
	anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
			0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor, NULL);
}

/*
 * Getting a lock on a stable anon_vma from a page off the LRU is
 * tricky: page_lock_anon_vma relies on RCU to guard against the races.
 */
static struct anon_vma *page_lock_anon_vma(struct page *page)
{
	struct anon_vma *anon_vma = NULL;
	unsigned long anon_mapping;

	rcu_read_lock();
	anon_mapping = (unsigned long) page->mapping;
	if (!(anon_mapping & PAGE_MAPPING_ANON))
		goto out;
	if (!page_mapped(page))
		goto out;

	anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
	spin_lock(&anon_vma->lock);
out:
	rcu_read_unlock();
	return anon_vma;
}
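/*
 * Note: the above is safe because anon_vma_cachep is created with
 * SLAB_DESTROY_BY_RCU (see anon_vma_init).  While we hold
 * rcu_read_lock(), the anon_vma read from page->mapping may already
 * have been freed - and even reused for another anon_vma - but its
 * memory cannot be handed back to the page allocator, so taking
 * anon_vma->lock never touches invalid memory.  Callers walking
 * anon_vma->head must therefore tolerate finding no vma that still
 * maps the page.
 */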
/*
 * At what user virtual address is page expected in vma?
 */
static inline unsigned long
vma_address(struct page *page, struct vm_area_struct *vma)
{
	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
	unsigned long address;

	address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
	if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
		/* page should be within any vma from prio_tree_next */
		BUG_ON(!PageAnon(page));
		return -EFAULT;
	}
	return address;
}

/*
 * At what user virtual address is page expected in vma? Checking that the
 * page matches the vma: currently only used on anon pages, by unuse_vma.
 */
unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
{
	if (PageAnon(page)) {
		if ((void *)vma->anon_vma !=
		    (void *)page->mapping - PAGE_MAPPING_ANON)
			return -EFAULT;
	} else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) {
		if (!vma->vm_file ||
		    vma->vm_file->f_mapping != page->mapping)
			return -EFAULT;
	} else
		return -EFAULT;
	return vma_address(page, vma);
}

/*
 * Check that @page is mapped at @address into @mm.
 *
 * On success returns with pte mapped and locked.
 */
pte_t *page_check_address(struct page *page, struct mm_struct *mm,
			  unsigned long address, spinlock_t **ptlp)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;
	spinlock_t *ptl;

	pgd = pgd_offset(mm, address);
	if (!pgd_present(*pgd))
		return NULL;

	pud = pud_offset(pgd, address);
	if (!pud_present(*pud))
		return NULL;

	pmd = pmd_offset(pud, address);
	if (!pmd_present(*pmd))
		return NULL;

	pte = pte_offset_map(pmd, address);
	/* Make a quick check before getting the lock */
	if (!pte_present(*pte)) {
		pte_unmap(pte);
		return NULL;
	}

	ptl = pte_lockptr(mm, pmd);
	spin_lock(ptl);
	if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) {
		*ptlp = ptl;
		return pte;
	}
	pte_unmap_unlock(pte, ptl);
	return NULL;
}
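/*
 * Note: a non-NULL return from page_check_address() leaves the pte
 * mapped and its pte lock held, so every caller must finish with
 * pte_unmap_unlock(pte, ptl); a NULL return means the page is not
 * (or no longer) mapped at that address and nothing is held.
 */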
/*
 * Subfunctions of page_referenced: page_referenced_one called
 * repeatedly from either page_referenced_anon or page_referenced_file.
 */
static int page_referenced_one(struct page *page,
	struct vm_area_struct *vma, unsigned int *mapcount)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long address;
	pte_t *pte;
	spinlock_t *ptl;
	int referenced = 0;

	address = vma_address(page, vma);
	if (address == -EFAULT)
		goto out;

	pte = page_check_address(page, mm, address, &ptl);
	if (!pte)
		goto out;

	if (ptep_clear_flush_young(vma, address, pte))
		referenced++;

	/* Pretend the page is referenced if the task has the
	   swap token and is in the middle of a page fault. */
	if (mm != current->mm && has_swap_token(mm) &&
			rwsem_is_locked(&mm->mmap_sem))
		referenced++;

	(*mapcount)--;
	pte_unmap_unlock(pte, ptl);
out:
	return referenced;
}

static int page_referenced_anon(struct page *page)
{
	unsigned int mapcount;
	struct anon_vma *anon_vma;
	struct vm_area_struct *vma;
	int referenced = 0;

	anon_vma = page_lock_anon_vma(page);
	if (!anon_vma)
		return referenced;

	mapcount = page_mapcount(page);
	list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
		referenced += page_referenced_one(page, vma, &mapcount);
		if (!mapcount)
			break;
	}
	spin_unlock(&anon_vma->lock);
	return referenced;
}

/**
 * page_referenced_file - referenced check for object-based rmap
 * @page: the page we're checking references on.
 *
 * For an object-based mapped page, find all the places it is mapped and
 * check/clear the referenced flag.  This is done by following the page->mapping
 * pointer, then walking the chain of vmas it holds.  It returns the number
 * of references it found.
 *
 * This function is only called from page_referenced for object-based pages.
 */
static int page_referenced_file(struct page *page)
{
	unsigned int mapcount;
	struct address_space *mapping = page->mapping;
	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
	struct vm_area_struct *vma;
	struct prio_tree_iter iter;
	int referenced = 0;

	/*
	 * The caller's checks on page->mapping and !PageAnon have made
	 * sure that this is a file page: the check for page->mapping
	 * excludes the case just before it gets set on an anon page.
	 */
	BUG_ON(PageAnon(page));

	/*
	 * The page lock not only makes sure that page->mapping cannot
	 * suddenly be NULLified by truncation, it makes sure that the
	 * structure at mapping cannot be freed and reused yet,
	 * so we can safely take mapping->i_mmap_lock.
	 */
	BUG_ON(!PageLocked(page));

	spin_lock(&mapping->i_mmap_lock);

	/*
	 * i_mmap_lock does not stabilize mapcount at all, but mapcount
	 * is more likely to be accurate if we note it after spinning.
	 */
	mapcount = page_mapcount(page);

	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
		if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE))
				  == (VM_LOCKED|VM_MAYSHARE)) {
			referenced++;
			break;
		}
		referenced += page_referenced_one(page, vma, &mapcount);
		if (!mapcount)
			break;
	}

	spin_unlock(&mapping->i_mmap_lock);
	return referenced;
}

/**
 * page_referenced - test if the page was referenced
 * @page: the page to test
 * @is_locked: caller holds lock on the page
 *
 * Quick test_and_clear_referenced for all mappings to a page,
 * returns the number of ptes which referenced the page.
 */
int page_referenced(struct page *page, int is_locked)
{
	int referenced = 0;

	if (page_test_and_clear_young(page))
		referenced++;

	if (TestClearPageReferenced(page))
		referenced++;

	if (page_mapped(page) && page->mapping) {
		if (PageAnon(page))
			referenced += page_referenced_anon(page);
		else if (is_locked)
			referenced += page_referenced_file(page);
		else if (TestSetPageLocked(page))
			referenced++;
		else {
			if (page->mapping)
				referenced += page_referenced_file(page);
			unlock_page(page);
		}
	}
	return referenced;
}

static int page_mkclean_one(struct page *page, struct vm_area_struct *vma)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long address;
	pte_t *pte;
	spinlock_t *ptl;
	int ret = 0;

	address = vma_address(page, vma);
	if (address == -EFAULT)
		goto out;

	pte = page_check_address(page, mm, address, &ptl);
	if (!pte)
		goto out;

	if (pte_dirty(*pte) || pte_write(*pte)) {
		pte_t entry;

		flush_cache_page(vma, address, pte_pfn(*pte));
		entry = ptep_clear_flush(vma, address, pte);
		entry = pte_wrprotect(entry);
		entry = pte_mkclean(entry);
		set_pte_at(mm, address, pte, entry);
		lazy_mmu_prot_update(entry);
		ret = 1;
	}

	pte_unmap_unlock(pte, ptl);
out:
	return ret;
}

static int page_mkclean_file(struct address_space *mapping, struct page *page)
{
	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
	struct vm_area_struct *vma;
	struct prio_tree_iter iter;
	int ret = 0;

	BUG_ON(PageAnon(page));

	spin_lock(&mapping->i_mmap_lock);
	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
		if (vma->vm_flags & VM_SHARED)
			ret += page_mkclean_one(page, vma);
	}
	spin_unlock(&mapping->i_mmap_lock);
	return ret;
}

int page_mkclean(struct page *page)
{
	int ret = 0;

	BUG_ON(!PageLocked(page));

	if (page_mapped(page)) {
		struct address_space *mapping = page_mapping(page);
		if (mapping)
			ret = page_mkclean_file(mapping, page);
	}
	if (page_test_and_clear_dirty(page))
		ret = 1;

	return ret;
}

/**
 * __page_set_anon_rmap - setup new anonymous rmap
 * @page: the page to add the mapping to
 * @vma: the vm area in which the mapping is added
 * @address: the user virtual address mapped
 */
static void __page_set_anon_rmap(struct page *page,
	struct vm_area_struct *vma, unsigned long address)
{
	struct anon_vma *anon_vma = vma->anon_vma;

	BUG_ON(!anon_vma);
	anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
	page->mapping = (struct address_space *) anon_vma;

	page->index = linear_page_index(vma, address);

	/*
	 * nr_mapped state can be updated without turning off
	 * interrupts because it is not modified via interrupt.
	 */
	__inc_zone_page_state(page, NR_ANON_PAGES);
}
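/*
 * Note on the _mapcount convention used below: page->_mapcount starts
 * at -1 for an unmapped page, so it holds "number of ptes mapping the
 * page, minus one".  That is why page_add_anon_rmap() and
 * page_add_file_rmap() use atomic_inc_and_test(), which is true when
 * _mapcount goes from -1 to 0 (the first mapping), and why
 * page_remove_rmap() uses atomic_add_negative(-1, ...) to detect the
 * last unmap; only those transitions adjust the NR_ANON_PAGES and
 * NR_FILE_MAPPED zone counters.
 */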
/**
 * page_add_anon_rmap - add pte mapping to an anonymous page
 * @page: the page to add the mapping to
 * @vma: the vm area in which the mapping is added
 * @address: the user virtual address mapped
 *
 * The caller needs to hold the pte lock.
 */
void page_add_anon_rmap(struct page *page,
	struct vm_area_struct *vma, unsigned long address)
{
	if (atomic_inc_and_test(&page->_mapcount))
		__page_set_anon_rmap(page, vma, address);
	/* else checking page index and mapping is racy */
}

/*
 * page_add_new_anon_rmap - add pte mapping to a new anonymous page
 * @page: the page to add the mapping to
 * @vma: the vm area in which the mapping is added
 * @address: the user virtual address mapped
 *
 * Same as page_add_anon_rmap but must only be called on *new* pages.
 * This means the inc-and-test can be bypassed.
 */
void page_add_new_anon_rmap(struct page *page,
	struct vm_area_struct *vma, unsigned long address)
{
	atomic_set(&page->_mapcount, 0); /* elevate count by 1 (starts at -1) */
	__page_set_anon_rmap(page, vma, address);
}

/**
 * page_add_file_rmap - add pte mapping to a file page
 * @page: the page to add the mapping to
 *
 * The caller needs to hold the pte lock.
 */
void page_add_file_rmap(struct page *page)
{
	if (atomic_inc_and_test(&page->_mapcount))
		__inc_zone_page_state(page, NR_FILE_MAPPED);
}

/**
 * page_remove_rmap - take down pte mapping from a page
 * @page: page to remove mapping from
 *
 * The caller needs to hold the pte lock.
 */
void page_remove_rmap(struct page *page, struct vm_area_struct *vma)
{
	if (atomic_add_negative(-1, &page->_mapcount)) {
		if (unlikely(page_mapcount(page) < 0)) {
			printk(KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page));
			printk(KERN_EMERG "  page pfn = %lx\n", page_to_pfn(page));
			printk(KERN_EMERG "  page->flags = %lx\n", page->flags);
			printk(KERN_EMERG "  page->count = %x\n", page_count(page));
			printk(KERN_EMERG "  page->mapping = %p\n", page->mapping);
			print_symbol(KERN_EMERG "  vma->vm_ops = %s\n", (unsigned long)vma->vm_ops);
			if (vma->vm_ops)
				print_symbol(KERN_EMERG "  vma->vm_ops->nopage = %s\n", (unsigned long)vma->vm_ops->nopage);
			if (vma->vm_file && vma->vm_file->f_op)
				print_symbol(KERN_EMERG "  vma->vm_file->f_op->mmap = %s\n", (unsigned long)vma->vm_file->f_op->mmap);
			BUG();
		}

		/*
		 * It would be tidy to reset the PageAnon mapping here,
		 * but that might overwrite a racing page_add_anon_rmap
		 * which increments mapcount after us but sets mapping
		 * before us: so leave the reset to free_hot_cold_page,
		 * and remember that it's only reliable while mapped.
		 * Leaving it set also helps swapoff to reinstate ptes
		 * faster for those pages still in swapcache.
		 */
		if (page_test_and_clear_dirty(page))
			set_page_dirty(page);
		__dec_zone_page_state(page,
				PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED);
	}
}
/*
 * Subfunctions of try_to_unmap: try_to_unmap_one called
 * repeatedly from either try_to_unmap_anon or try_to_unmap_file.
 */
static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
				int migration)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long address;
	pte_t *pte;
	pte_t pteval;
	spinlock_t *ptl;
	int ret = SWAP_AGAIN;

	address = vma_address(page, vma);
	if (address == -EFAULT)
		goto out;

	pte = page_check_address(page, mm, address, &ptl);
	if (!pte)
		goto out;

	/*
	 * If the page is mlock()d, we cannot swap it out.
	 * If it's recently referenced (perhaps page_referenced
	 * skipped over this mm) then we should reactivate it.
	 */
	if (!migration && ((vma->vm_flags & VM_LOCKED) ||
			(ptep_clear_flush_young(vma, address, pte)))) {
		ret = SWAP_FAIL;
		goto out_unmap;
	}

	/* Nuke the page table entry. */
	flush_cache_page(vma, address, page_to_pfn(page));
	pteval = ptep_clear_flush(vma, address, pte);

	/* Move the dirty bit to the physical page now the pte is gone. */
	if (pte_dirty(pteval))
		set_page_dirty(page);

	/* Update high watermark before we lower rss */
	update_hiwater_rss(mm);

	if (PageAnon(page)) {
		swp_entry_t entry = { .val = page_private(page) };

		if (PageSwapCache(page)) {
			/*
			 * Store the swap location in the pte.
			 * See handle_pte_fault() ...
			 */
			swap_duplicate(entry);
			if (list_empty(&mm->mmlist)) {
				spin_lock(&mmlist_lock);
				if (list_empty(&mm->mmlist))
					list_add(&mm->mmlist, &init_mm.mmlist);
				spin_unlock(&mmlist_lock);
			}
			dec_mm_counter(mm, anon_rss);
#ifdef CONFIG_MIGRATION
		} else {
			/*
			 * Store the pfn of the page in a special migration
			 * pte. do_swap_page() will wait until the migration
			 * pte is removed and then restart fault handling.
			 */
			BUG_ON(!migration);
			entry = make_migration_entry(page, pte_write(pteval));
#endif
		}
		set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
		BUG_ON(pte_file(*pte));
	} else
#ifdef CONFIG_MIGRATION
	if (migration) {
		/* Establish migration entry for a file page */
		swp_entry_t entry;
		entry = make_migration_entry(page, pte_write(pteval));
		set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
	} else
#endif
		dec_mm_counter(mm, file_rss);

	page_remove_rmap(page, vma);
	page_cache_release(page);

out_unmap:
	pte_unmap_unlock(pte, ptl);
out:
	return ret;
}

/*
 * objrmap doesn't work for nonlinear VMAs because the assumption that
 * offset-into-file correlates with offset-into-virtual-addresses does not hold.
 * Consequently, given a particular page and its ->index, we cannot locate the
 * ptes which are mapping that page without an exhaustive linear search.
 *
 * So what this code does is a mini "virtual scan" of each nonlinear VMA which
 * maps the file to which the target page belongs.  The ->vm_private_data field
 * holds the current cursor into that scan.  Successive searches will circulate
 * around the vma's virtual address space.
 *
 * So as more replacement pressure is applied to the pages in a nonlinear VMA,
 * more scanning pressure is placed against them as well.  Eventually pages
 * will become fully unmapped and are eligible for eviction.
 *
 * For very sparsely populated VMAs this is a little inefficient - chances are
 * there won't be many ptes located within the scan cluster.  In this case
 * maybe we could scan further - to the end of the pte page, perhaps.
 */
#define CLUSTER_SIZE	min(32*PAGE_SIZE, PMD_SIZE)
#define CLUSTER_MASK	(~(CLUSTER_SIZE - 1))
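/*
 * For example, with 4K pages and a 2M or 4M pmd, CLUSTER_SIZE is
 * min(32 * 4K, PMD_SIZE) = 128K, so each try_to_unmap_cluster() call
 * below scans at most 32 ptes.  CLUSTER_SIZE is a power of two, so
 * CLUSTER_MASK rounds the cursor down to a cluster boundary, and
 * capping the size at PMD_SIZE keeps a cluster within a single pte
 * page, which is what allows one pte_offset_map_lock() to cover the
 * whole range.
 */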
static void try_to_unmap_cluster(unsigned long cursor,
	unsigned int *mapcount, struct vm_area_struct *vma)
{
	struct mm_struct *mm = vma->vm_mm;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;
	pte_t pteval;
	spinlock_t *ptl;
	struct page *page;
	unsigned long address;
	unsigned long end;

	address = (vma->vm_start + cursor) & CLUSTER_MASK;
	end = address + CLUSTER_SIZE;
	if (address < vma->vm_start)
		address = vma->vm_start;
	if (end > vma->vm_end)
		end = vma->vm_end;

	pgd = pgd_offset(mm, address);
	if (!pgd_present(*pgd))
		return;

	pud = pud_offset(pgd, address);
	if (!pud_present(*pud))
		return;

	pmd = pmd_offset(pud, address);
	if (!pmd_present(*pmd))
		return;

	pte = pte_offset_map_lock(mm, pmd, address, &ptl);

	/* Update high watermark before we lower rss */
	update_hiwater_rss(mm);

	for (; address < end; pte++, address += PAGE_SIZE) {
		if (!pte_present(*pte))
			continue;
		page = vm_normal_page(vma, address, *pte);
		BUG_ON(!page || PageAnon(page));

		if (ptep_clear_flush_young(vma, address, pte))
			continue;

		/* Nuke the page table entry. */
		flush_cache_page(vma, address, pte_pfn(*pte));
		pteval = ptep_clear_flush(vma, address, pte);

		/* If nonlinear, store the file page offset in the pte. */
		if (page->index != linear_page_index(vma, address))
			set_pte_at(mm, address, pte, pgoff_to_pte(page->index));

		/* Move the dirty bit to the physical page now the pte is gone. */
		if (pte_dirty(pteval))
			set_page_dirty(page);

		page_remove_rmap(page, vma);
		page_cache_release(page);
		dec_mm_counter(mm, file_rss);
		(*mapcount)--;
	}
	pte_unmap_unlock(pte - 1, ptl);
}

static int try_to_unmap_anon(struct page *page, int migration)
{
	struct anon_vma *anon_vma;
	struct vm_area_struct *vma;
	int ret = SWAP_AGAIN;

	anon_vma = page_lock_anon_vma(page);
	if (!anon_vma)
		return ret;

	list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
		ret = try_to_unmap_one(page, vma, migration);
		if (ret == SWAP_FAIL || !page_mapped(page))
			break;
	}
	spin_unlock(&anon_vma->lock);
	return ret;
}

/**
 * try_to_unmap_file - unmap file page using the object-based rmap method
 * @page: the page to unmap
 *
 * Find all the mappings of a page using the mapping pointer and the vma chains
 * contained in the address_space struct it points to.
 *
 * This function is only called from try_to_unmap for object-based pages.
 */
static int try_to_unmap_file(struct page *page, int migration)
{
	struct address_space *mapping = page->mapping;
	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
	struct vm_area_struct *vma;
	struct prio_tree_iter iter;
	int ret = SWAP_AGAIN;
	unsigned long cursor;
	unsigned long max_nl_cursor = 0;
	unsigned long max_nl_size = 0;
	unsigned int mapcount;

	spin_lock(&mapping->i_mmap_lock);
	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
		ret = try_to_unmap_one(page, vma, migration);
		if (ret == SWAP_FAIL || !page_mapped(page))
			goto out;
	}

	if (list_empty(&mapping->i_mmap_nonlinear))
		goto out;

	list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
						shared.vm_set.list) {
		if ((vma->vm_flags & VM_LOCKED) && !migration)
			continue;
		cursor = (unsigned long) vma->vm_private_data;
		if (cursor > max_nl_cursor)
			max_nl_cursor = cursor;
		cursor = vma->vm_end - vma->vm_start;
		if (cursor > max_nl_size)
			max_nl_size = cursor;
	}

	if (max_nl_size == 0) {	/* any nonlinears locked or reserved */
		ret = SWAP_FAIL;
		goto out;
	}

	/*
	 * We don't try to search for this page in the nonlinear vmas,
	 * and page_referenced wouldn't have found it anyway.  Instead
	 * just walk the nonlinear vmas trying to age and unmap some.
	 * The mapcount of the page we came in with is irrelevant,
	 * but even so use it as a guide to how hard we should try?
	 */
	mapcount = page_mapcount(page);
	if (!mapcount)
		goto out;
	cond_resched_lock(&mapping->i_mmap_lock);

	max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK;
	if (max_nl_cursor == 0)
		max_nl_cursor = CLUSTER_SIZE;

	do {
		list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
						shared.vm_set.list) {
			if ((vma->vm_flags & VM_LOCKED) && !migration)
				continue;
			cursor = (unsigned long) vma->vm_private_data;
			while (cursor < max_nl_cursor &&
				cursor < vma->vm_end - vma->vm_start) {
				try_to_unmap_cluster(cursor, &mapcount, vma);
				cursor += CLUSTER_SIZE;
				vma->vm_private_data = (void *) cursor;
				if ((int)mapcount <= 0)
					goto out;
			}
			vma->vm_private_data = (void *) max_nl_cursor;
		}
		cond_resched_lock(&mapping->i_mmap_lock);
		max_nl_cursor += CLUSTER_SIZE;
	} while (max_nl_cursor <= max_nl_size);

	/*
	 * Don't loop forever (perhaps all the remaining pages are
	 * in locked vmas).  Reset cursor on all unreserved nonlinear
	 * vmas, now forgetting on which ones it had fallen behind.
	 */
	list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
		vma->vm_private_data = NULL;
out:
	spin_unlock(&mapping->i_mmap_lock);
	return ret;
}

/**
 * try_to_unmap - try to remove all page table mappings to a page
 * @page: the page to get unmapped
 *
 * Tries to remove all the page table entries which are mapping this
 * page, used in the pageout path.  Caller must hold the page lock.
 * Return values are:
 *
 * SWAP_SUCCESS	- we succeeded in removing all mappings
 * SWAP_AGAIN	- we missed a mapping, try again later
 * SWAP_FAIL	- the page is unswappable
 */
int try_to_unmap(struct page *page, int migration)
{
	int ret;

	BUG_ON(!PageLocked(page));

	if (PageAnon(page))
		ret = try_to_unmap_anon(page, migration);
	else
		ret = try_to_unmap_file(page, migration);

	if (!page_mapped(page))
		ret = SWAP_SUCCESS;
	return ret;
}
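/*
 * Illustrative sketch (editorial addition, not part of rmap.c): roughly
 * how the page reclaim path is expected to drive the entry points above.
 * The real caller is shrink_page_list() in mm/vmscan.c, which also
 * handles swap allocation, writeback and LRU movement; the helper name
 * below is made up for illustration only and the block is kept under
 * "#if 0" so it is never compiled.
 */
#if 0	/* example only */
static int reclaim_one_page_sketch(struct page *page)
{
	int freed = 0;

	if (TestSetPageLocked(page))	/* rmap callers need the page locked */
		return 0;

	/* Recently referenced pages should stay on the LRU. */
	if (page_referenced(page, 1))
		goto out;

	/* Drop every pte mapping the page; anything but success keeps it. */
	if (page_mapped(page) && try_to_unmap(page, 0) != SWAP_SUCCESS)
		goto out;

	/* Once unmapped, the page can be cleaned and then freed. */
	freed = !page_mapped(page);
out:
	unlock_page(page);
	return freed;
}
#endif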