rmap.c revision fcc234f888ba2365c44ba0507eb8a18eebf1f594
1/* 2 * mm/rmap.c - physical to virtual reverse mappings 3 * 4 * Copyright 2001, Rik van Riel <riel@conectiva.com.br> 5 * Released under the General Public License (GPL). 6 * 7 * Simple, low overhead reverse mapping scheme. 8 * Please try to keep this thing as modular as possible. 9 * 10 * Provides methods for unmapping each kind of mapped page: 11 * the anon methods track anonymous pages, and 12 * the file methods track pages belonging to an inode. 13 * 14 * Original design by Rik van Riel <riel@conectiva.com.br> 2001 15 * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004 16 * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004 17 * Contributions by Hugh Dickins <hugh@veritas.com> 2003, 2004 18 */ 19 20/* 21 * Lock ordering in mm: 22 * 23 * inode->i_mutex (while writing or truncating, not reading or faulting) 24 * inode->i_alloc_sem 25 * 26 * When a page fault occurs in writing from user to file, down_read 27 * of mmap_sem nests within i_mutex; in sys_msync, i_mutex nests within 28 * down_read of mmap_sem; i_mutex and down_write of mmap_sem are never 29 * taken together; in truncation, i_mutex is taken outermost. 30 * 31 * mm->mmap_sem 32 * page->flags PG_locked (lock_page) 33 * mapping->i_mmap_lock 34 * anon_vma->lock 35 * mm->page_table_lock or pte_lock 36 * zone->lru_lock (in mark_page_accessed, isolate_lru_page) 37 * swap_lock (in swap_duplicate, swap_info_get) 38 * mmlist_lock (in mmput, drain_mmlist and others) 39 * mapping->private_lock (in __set_page_dirty_buffers) 40 * inode_lock (in set_page_dirty's __mark_inode_dirty) 41 * sb_lock (within inode_lock in fs/fs-writeback.c) 42 * mapping->tree_lock (widely used, in set_page_dirty, 43 * in arch-dependent flush_dcache_mmap_lock, 44 * within inode_lock in __sync_single_inode) 45 */ 46 47#include <linux/mm.h> 48#include <linux/pagemap.h> 49#include <linux/swap.h> 50#include <linux/swapops.h> 51#include <linux/slab.h> 52#include <linux/init.h> 53#include <linux/rmap.h> 54#include <linux/rcupdate.h> 55#include <linux/module.h> 56 57#include <asm/tlbflush.h> 58 59//#define RMAP_DEBUG /* can be enabled only for debugging */ 60 61struct kmem_cache *anon_vma_cachep; 62 63static inline void validate_anon_vma(struct vm_area_struct *find_vma) 64{ 65#ifdef RMAP_DEBUG 66 struct anon_vma *anon_vma = find_vma->anon_vma; 67 struct vm_area_struct *vma; 68 unsigned int mapcount = 0; 69 int found = 0; 70 71 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { 72 mapcount++; 73 BUG_ON(mapcount > 100000); 74 if (vma == find_vma) 75 found = 1; 76 } 77 BUG_ON(!found); 78#endif 79} 80 81/* This must be called under the mmap_sem. */ 82int anon_vma_prepare(struct vm_area_struct *vma) 83{ 84 struct anon_vma *anon_vma = vma->anon_vma; 85 86 might_sleep(); 87 if (unlikely(!anon_vma)) { 88 struct mm_struct *mm = vma->vm_mm; 89 struct anon_vma *allocated, *locked; 90 91 anon_vma = find_mergeable_anon_vma(vma); 92 if (anon_vma) { 93 allocated = NULL; 94 locked = anon_vma; 95 spin_lock(&locked->lock); 96 } else { 97 anon_vma = anon_vma_alloc(); 98 if (unlikely(!anon_vma)) 99 return -ENOMEM; 100 allocated = anon_vma; 101 locked = NULL; 102 } 103 104 /* page_table_lock to protect against threads */ 105 spin_lock(&mm->page_table_lock); 106 if (likely(!vma->anon_vma)) { 107 vma->anon_vma = anon_vma; 108 list_add(&vma->anon_vma_node, &anon_vma->head); 109 allocated = NULL; 110 } 111 spin_unlock(&mm->page_table_lock); 112 113 if (locked) 114 spin_unlock(&locked->lock); 115 if (unlikely(allocated)) 116 anon_vma_free(allocated); 117 } 118 return 0; 119} 120 121void __anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next) 122{ 123 BUG_ON(vma->anon_vma != next->anon_vma); 124 list_del(&next->anon_vma_node); 125} 126 127void __anon_vma_link(struct vm_area_struct *vma) 128{ 129 struct anon_vma *anon_vma = vma->anon_vma; 130 131 if (anon_vma) { 132 list_add(&vma->anon_vma_node, &anon_vma->head); 133 validate_anon_vma(vma); 134 } 135} 136 137void anon_vma_link(struct vm_area_struct *vma) 138{ 139 struct anon_vma *anon_vma = vma->anon_vma; 140 141 if (anon_vma) { 142 spin_lock(&anon_vma->lock); 143 list_add(&vma->anon_vma_node, &anon_vma->head); 144 validate_anon_vma(vma); 145 spin_unlock(&anon_vma->lock); 146 } 147} 148 149void anon_vma_unlink(struct vm_area_struct *vma) 150{ 151 struct anon_vma *anon_vma = vma->anon_vma; 152 int empty; 153 154 if (!anon_vma) 155 return; 156 157 spin_lock(&anon_vma->lock); 158 validate_anon_vma(vma); 159 list_del(&vma->anon_vma_node); 160 161 /* We must garbage collect the anon_vma if it's empty */ 162 empty = list_empty(&anon_vma->head); 163 spin_unlock(&anon_vma->lock); 164 165 if (empty) 166 anon_vma_free(anon_vma); 167} 168 169static void anon_vma_ctor(void *data, struct kmem_cache *cachep, 170 unsigned long flags) 171{ 172 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 173 SLAB_CTOR_CONSTRUCTOR) { 174 struct anon_vma *anon_vma = data; 175 176 spin_lock_init(&anon_vma->lock); 177 INIT_LIST_HEAD(&anon_vma->head); 178 } 179} 180 181void __init anon_vma_init(void) 182{ 183 anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma), 184 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor, NULL); 185} 186 187/* 188 * Getting a lock on a stable anon_vma from a page off the LRU is 189 * tricky: page_lock_anon_vma rely on RCU to guard against the races. 190 */ 191static struct anon_vma *page_lock_anon_vma(struct page *page) 192{ 193 struct anon_vma *anon_vma = NULL; 194 unsigned long anon_mapping; 195 196 rcu_read_lock(); 197 anon_mapping = (unsigned long) page->mapping; 198 if (!(anon_mapping & PAGE_MAPPING_ANON)) 199 goto out; 200 if (!page_mapped(page)) 201 goto out; 202 203 anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); 204 spin_lock(&anon_vma->lock); 205out: 206 rcu_read_unlock(); 207 return anon_vma; 208} 209 210#ifdef CONFIG_MIGRATION 211/* 212 * Remove an anonymous page from swap replacing the swap pte's 213 * through real pte's pointing to valid pages and then releasing 214 * the page from the swap cache. 215 * 216 * Must hold page lock on page and mmap_sem of one vma that contains 217 * the page. 218 */ 219void remove_from_swap(struct page *page) 220{ 221 struct anon_vma *anon_vma; 222 struct vm_area_struct *vma; 223 unsigned long mapping; 224 225 if (!PageSwapCache(page)) 226 return; 227 228 mapping = (unsigned long)page->mapping; 229 230 if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0) 231 return; 232 233 /* 234 * We hold the mmap_sem lock. So no need to call page_lock_anon_vma. 235 */ 236 anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON); 237 spin_lock(&anon_vma->lock); 238 239 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) 240 remove_vma_swap(vma, page); 241 242 spin_unlock(&anon_vma->lock); 243 delete_from_swap_cache(page); 244} 245EXPORT_SYMBOL(remove_from_swap); 246#endif 247 248/* 249 * At what user virtual address is page expected in vma? 250 */ 251static inline unsigned long 252vma_address(struct page *page, struct vm_area_struct *vma) 253{ 254 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 255 unsigned long address; 256 257 address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); 258 if (unlikely(address < vma->vm_start || address >= vma->vm_end)) { 259 /* page should be within any vma from prio_tree_next */ 260 BUG_ON(!PageAnon(page)); 261 return -EFAULT; 262 } 263 return address; 264} 265 266/* 267 * At what user virtual address is page expected in vma? checking that the 268 * page matches the vma: currently only used on anon pages, by unuse_vma; 269 */ 270unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) 271{ 272 if (PageAnon(page)) { 273 if ((void *)vma->anon_vma != 274 (void *)page->mapping - PAGE_MAPPING_ANON) 275 return -EFAULT; 276 } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) { 277 if (!vma->vm_file || 278 vma->vm_file->f_mapping != page->mapping) 279 return -EFAULT; 280 } else 281 return -EFAULT; 282 return vma_address(page, vma); 283} 284 285/* 286 * Check that @page is mapped at @address into @mm. 287 * 288 * On success returns with pte mapped and locked. 289 */ 290pte_t *page_check_address(struct page *page, struct mm_struct *mm, 291 unsigned long address, spinlock_t **ptlp) 292{ 293 pgd_t *pgd; 294 pud_t *pud; 295 pmd_t *pmd; 296 pte_t *pte; 297 spinlock_t *ptl; 298 299 pgd = pgd_offset(mm, address); 300 if (!pgd_present(*pgd)) 301 return NULL; 302 303 pud = pud_offset(pgd, address); 304 if (!pud_present(*pud)) 305 return NULL; 306 307 pmd = pmd_offset(pud, address); 308 if (!pmd_present(*pmd)) 309 return NULL; 310 311 pte = pte_offset_map(pmd, address); 312 /* Make a quick check before getting the lock */ 313 if (!pte_present(*pte)) { 314 pte_unmap(pte); 315 return NULL; 316 } 317 318 ptl = pte_lockptr(mm, pmd); 319 spin_lock(ptl); 320 if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) { 321 *ptlp = ptl; 322 return pte; 323 } 324 pte_unmap_unlock(pte, ptl); 325 return NULL; 326} 327 328/* 329 * Subfunctions of page_referenced: page_referenced_one called 330 * repeatedly from either page_referenced_anon or page_referenced_file. 331 */ 332static int page_referenced_one(struct page *page, 333 struct vm_area_struct *vma, unsigned int *mapcount) 334{ 335 struct mm_struct *mm = vma->vm_mm; 336 unsigned long address; 337 pte_t *pte; 338 spinlock_t *ptl; 339 int referenced = 0; 340 341 address = vma_address(page, vma); 342 if (address == -EFAULT) 343 goto out; 344 345 pte = page_check_address(page, mm, address, &ptl); 346 if (!pte) 347 goto out; 348 349 if (ptep_clear_flush_young(vma, address, pte)) 350 referenced++; 351 352 /* Pretend the page is referenced if the task has the 353 swap token and is in the middle of a page fault. */ 354 if (mm != current->mm && has_swap_token(mm) && 355 rwsem_is_locked(&mm->mmap_sem)) 356 referenced++; 357 358 (*mapcount)--; 359 pte_unmap_unlock(pte, ptl); 360out: 361 return referenced; 362} 363 364static int page_referenced_anon(struct page *page) 365{ 366 unsigned int mapcount; 367 struct anon_vma *anon_vma; 368 struct vm_area_struct *vma; 369 int referenced = 0; 370 371 anon_vma = page_lock_anon_vma(page); 372 if (!anon_vma) 373 return referenced; 374 375 mapcount = page_mapcount(page); 376 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { 377 referenced += page_referenced_one(page, vma, &mapcount); 378 if (!mapcount) 379 break; 380 } 381 spin_unlock(&anon_vma->lock); 382 return referenced; 383} 384 385/** 386 * page_referenced_file - referenced check for object-based rmap 387 * @page: the page we're checking references on. 388 * 389 * For an object-based mapped page, find all the places it is mapped and 390 * check/clear the referenced flag. This is done by following the page->mapping 391 * pointer, then walking the chain of vmas it holds. It returns the number 392 * of references it found. 393 * 394 * This function is only called from page_referenced for object-based pages. 395 */ 396static int page_referenced_file(struct page *page) 397{ 398 unsigned int mapcount; 399 struct address_space *mapping = page->mapping; 400 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 401 struct vm_area_struct *vma; 402 struct prio_tree_iter iter; 403 int referenced = 0; 404 405 /* 406 * The caller's checks on page->mapping and !PageAnon have made 407 * sure that this is a file page: the check for page->mapping 408 * excludes the case just before it gets set on an anon page. 409 */ 410 BUG_ON(PageAnon(page)); 411 412 /* 413 * The page lock not only makes sure that page->mapping cannot 414 * suddenly be NULLified by truncation, it makes sure that the 415 * structure at mapping cannot be freed and reused yet, 416 * so we can safely take mapping->i_mmap_lock. 417 */ 418 BUG_ON(!PageLocked(page)); 419 420 spin_lock(&mapping->i_mmap_lock); 421 422 /* 423 * i_mmap_lock does not stabilize mapcount at all, but mapcount 424 * is more likely to be accurate if we note it after spinning. 425 */ 426 mapcount = page_mapcount(page); 427 428 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 429 if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE)) 430 == (VM_LOCKED|VM_MAYSHARE)) { 431 referenced++; 432 break; 433 } 434 referenced += page_referenced_one(page, vma, &mapcount); 435 if (!mapcount) 436 break; 437 } 438 439 spin_unlock(&mapping->i_mmap_lock); 440 return referenced; 441} 442 443/** 444 * page_referenced - test if the page was referenced 445 * @page: the page to test 446 * @is_locked: caller holds lock on the page 447 * 448 * Quick test_and_clear_referenced for all mappings to a page, 449 * returns the number of ptes which referenced the page. 450 */ 451int page_referenced(struct page *page, int is_locked) 452{ 453 int referenced = 0; 454 455 if (page_test_and_clear_young(page)) 456 referenced++; 457 458 if (TestClearPageReferenced(page)) 459 referenced++; 460 461 if (page_mapped(page) && page->mapping) { 462 if (PageAnon(page)) 463 referenced += page_referenced_anon(page); 464 else if (is_locked) 465 referenced += page_referenced_file(page); 466 else if (TestSetPageLocked(page)) 467 referenced++; 468 else { 469 if (page->mapping) 470 referenced += page_referenced_file(page); 471 unlock_page(page); 472 } 473 } 474 return referenced; 475} 476 477/** 478 * page_set_anon_rmap - setup new anonymous rmap 479 * @page: the page to add the mapping to 480 * @vma: the vm area in which the mapping is added 481 * @address: the user virtual address mapped 482 */ 483static void __page_set_anon_rmap(struct page *page, 484 struct vm_area_struct *vma, unsigned long address) 485{ 486 struct anon_vma *anon_vma = vma->anon_vma; 487 488 BUG_ON(!anon_vma); 489 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; 490 page->mapping = (struct address_space *) anon_vma; 491 492 page->index = linear_page_index(vma, address); 493 494 /* 495 * nr_mapped state can be updated without turning off 496 * interrupts because it is not modified via interrupt. 497 */ 498 __inc_page_state(nr_mapped); 499} 500 501/** 502 * page_add_anon_rmap - add pte mapping to an anonymous page 503 * @page: the page to add the mapping to 504 * @vma: the vm area in which the mapping is added 505 * @address: the user virtual address mapped 506 * 507 * The caller needs to hold the pte lock. 508 */ 509void page_add_anon_rmap(struct page *page, 510 struct vm_area_struct *vma, unsigned long address) 511{ 512 if (atomic_inc_and_test(&page->_mapcount)) 513 __page_set_anon_rmap(page, vma, address); 514 /* else checking page index and mapping is racy */ 515} 516 517/* 518 * page_add_new_anon_rmap - add pte mapping to a new anonymous page 519 * @page: the page to add the mapping to 520 * @vma: the vm area in which the mapping is added 521 * @address: the user virtual address mapped 522 * 523 * Same as page_add_anon_rmap but must only be called on *new* pages. 524 * This means the inc-and-test can be bypassed. 525 */ 526void page_add_new_anon_rmap(struct page *page, 527 struct vm_area_struct *vma, unsigned long address) 528{ 529 atomic_set(&page->_mapcount, 0); /* elevate count by 1 (starts at -1) */ 530 __page_set_anon_rmap(page, vma, address); 531} 532 533/** 534 * page_add_file_rmap - add pte mapping to a file page 535 * @page: the page to add the mapping to 536 * 537 * The caller needs to hold the pte lock. 538 */ 539void page_add_file_rmap(struct page *page) 540{ 541 if (atomic_inc_and_test(&page->_mapcount)) 542 __inc_page_state(nr_mapped); 543} 544 545/** 546 * page_remove_rmap - take down pte mapping from a page 547 * @page: page to remove mapping from 548 * 549 * The caller needs to hold the pte lock. 550 */ 551void page_remove_rmap(struct page *page) 552{ 553 if (atomic_add_negative(-1, &page->_mapcount)) { 554 if (page_mapcount(page) < 0) { 555 printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page)); 556 printk (KERN_EMERG " page->flags = %lx\n", page->flags); 557 printk (KERN_EMERG " page->count = %x\n", page_count(page)); 558 printk (KERN_EMERG " page->mapping = %p\n", page->mapping); 559 } 560 561 BUG_ON(page_mapcount(page) < 0); 562 /* 563 * It would be tidy to reset the PageAnon mapping here, 564 * but that might overwrite a racing page_add_anon_rmap 565 * which increments mapcount after us but sets mapping 566 * before us: so leave the reset to free_hot_cold_page, 567 * and remember that it's only reliable while mapped. 568 * Leaving it set also helps swapoff to reinstate ptes 569 * faster for those pages still in swapcache. 570 */ 571 if (page_test_and_clear_dirty(page)) 572 set_page_dirty(page); 573 __dec_page_state(nr_mapped); 574 } 575} 576 577/* 578 * Subfunctions of try_to_unmap: try_to_unmap_one called 579 * repeatedly from either try_to_unmap_anon or try_to_unmap_file. 580 */ 581static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, 582 int ignore_refs) 583{ 584 struct mm_struct *mm = vma->vm_mm; 585 unsigned long address; 586 pte_t *pte; 587 pte_t pteval; 588 spinlock_t *ptl; 589 int ret = SWAP_AGAIN; 590 591 address = vma_address(page, vma); 592 if (address == -EFAULT) 593 goto out; 594 595 pte = page_check_address(page, mm, address, &ptl); 596 if (!pte) 597 goto out; 598 599 /* 600 * If the page is mlock()d, we cannot swap it out. 601 * If it's recently referenced (perhaps page_referenced 602 * skipped over this mm) then we should reactivate it. 603 */ 604 if ((vma->vm_flags & VM_LOCKED) || 605 (ptep_clear_flush_young(vma, address, pte) 606 && !ignore_refs)) { 607 ret = SWAP_FAIL; 608 goto out_unmap; 609 } 610 611 /* Nuke the page table entry. */ 612 flush_cache_page(vma, address, page_to_pfn(page)); 613 pteval = ptep_clear_flush(vma, address, pte); 614 615 /* Move the dirty bit to the physical page now the pte is gone. */ 616 if (pte_dirty(pteval)) 617 set_page_dirty(page); 618 619 /* Update high watermark before we lower rss */ 620 update_hiwater_rss(mm); 621 622 if (PageAnon(page)) { 623 swp_entry_t entry = { .val = page_private(page) }; 624 /* 625 * Store the swap location in the pte. 626 * See handle_pte_fault() ... 627 */ 628 BUG_ON(!PageSwapCache(page)); 629 swap_duplicate(entry); 630 if (list_empty(&mm->mmlist)) { 631 spin_lock(&mmlist_lock); 632 if (list_empty(&mm->mmlist)) 633 list_add(&mm->mmlist, &init_mm.mmlist); 634 spin_unlock(&mmlist_lock); 635 } 636 set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); 637 BUG_ON(pte_file(*pte)); 638 dec_mm_counter(mm, anon_rss); 639 } else 640 dec_mm_counter(mm, file_rss); 641 642 page_remove_rmap(page); 643 page_cache_release(page); 644 645out_unmap: 646 pte_unmap_unlock(pte, ptl); 647out: 648 return ret; 649} 650 651/* 652 * objrmap doesn't work for nonlinear VMAs because the assumption that 653 * offset-into-file correlates with offset-into-virtual-addresses does not hold. 654 * Consequently, given a particular page and its ->index, we cannot locate the 655 * ptes which are mapping that page without an exhaustive linear search. 656 * 657 * So what this code does is a mini "virtual scan" of each nonlinear VMA which 658 * maps the file to which the target page belongs. The ->vm_private_data field 659 * holds the current cursor into that scan. Successive searches will circulate 660 * around the vma's virtual address space. 661 * 662 * So as more replacement pressure is applied to the pages in a nonlinear VMA, 663 * more scanning pressure is placed against them as well. Eventually pages 664 * will become fully unmapped and are eligible for eviction. 665 * 666 * For very sparsely populated VMAs this is a little inefficient - chances are 667 * there there won't be many ptes located within the scan cluster. In this case 668 * maybe we could scan further - to the end of the pte page, perhaps. 669 */ 670#define CLUSTER_SIZE min(32*PAGE_SIZE, PMD_SIZE) 671#define CLUSTER_MASK (~(CLUSTER_SIZE - 1)) 672 673static void try_to_unmap_cluster(unsigned long cursor, 674 unsigned int *mapcount, struct vm_area_struct *vma) 675{ 676 struct mm_struct *mm = vma->vm_mm; 677 pgd_t *pgd; 678 pud_t *pud; 679 pmd_t *pmd; 680 pte_t *pte; 681 pte_t pteval; 682 spinlock_t *ptl; 683 struct page *page; 684 unsigned long address; 685 unsigned long end; 686 687 address = (vma->vm_start + cursor) & CLUSTER_MASK; 688 end = address + CLUSTER_SIZE; 689 if (address < vma->vm_start) 690 address = vma->vm_start; 691 if (end > vma->vm_end) 692 end = vma->vm_end; 693 694 pgd = pgd_offset(mm, address); 695 if (!pgd_present(*pgd)) 696 return; 697 698 pud = pud_offset(pgd, address); 699 if (!pud_present(*pud)) 700 return; 701 702 pmd = pmd_offset(pud, address); 703 if (!pmd_present(*pmd)) 704 return; 705 706 pte = pte_offset_map_lock(mm, pmd, address, &ptl); 707 708 /* Update high watermark before we lower rss */ 709 update_hiwater_rss(mm); 710 711 for (; address < end; pte++, address += PAGE_SIZE) { 712 if (!pte_present(*pte)) 713 continue; 714 page = vm_normal_page(vma, address, *pte); 715 BUG_ON(!page || PageAnon(page)); 716 717 if (ptep_clear_flush_young(vma, address, pte)) 718 continue; 719 720 /* Nuke the page table entry. */ 721 flush_cache_page(vma, address, pte_pfn(*pte)); 722 pteval = ptep_clear_flush(vma, address, pte); 723 724 /* If nonlinear, store the file page offset in the pte. */ 725 if (page->index != linear_page_index(vma, address)) 726 set_pte_at(mm, address, pte, pgoff_to_pte(page->index)); 727 728 /* Move the dirty bit to the physical page now the pte is gone. */ 729 if (pte_dirty(pteval)) 730 set_page_dirty(page); 731 732 page_remove_rmap(page); 733 page_cache_release(page); 734 dec_mm_counter(mm, file_rss); 735 (*mapcount)--; 736 } 737 pte_unmap_unlock(pte - 1, ptl); 738} 739 740static int try_to_unmap_anon(struct page *page, int ignore_refs) 741{ 742 struct anon_vma *anon_vma; 743 struct vm_area_struct *vma; 744 int ret = SWAP_AGAIN; 745 746 anon_vma = page_lock_anon_vma(page); 747 if (!anon_vma) 748 return ret; 749 750 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { 751 ret = try_to_unmap_one(page, vma, ignore_refs); 752 if (ret == SWAP_FAIL || !page_mapped(page)) 753 break; 754 } 755 spin_unlock(&anon_vma->lock); 756 return ret; 757} 758 759/** 760 * try_to_unmap_file - unmap file page using the object-based rmap method 761 * @page: the page to unmap 762 * 763 * Find all the mappings of a page using the mapping pointer and the vma chains 764 * contained in the address_space struct it points to. 765 * 766 * This function is only called from try_to_unmap for object-based pages. 767 */ 768static int try_to_unmap_file(struct page *page, int ignore_refs) 769{ 770 struct address_space *mapping = page->mapping; 771 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 772 struct vm_area_struct *vma; 773 struct prio_tree_iter iter; 774 int ret = SWAP_AGAIN; 775 unsigned long cursor; 776 unsigned long max_nl_cursor = 0; 777 unsigned long max_nl_size = 0; 778 unsigned int mapcount; 779 780 spin_lock(&mapping->i_mmap_lock); 781 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 782 ret = try_to_unmap_one(page, vma, ignore_refs); 783 if (ret == SWAP_FAIL || !page_mapped(page)) 784 goto out; 785 } 786 787 if (list_empty(&mapping->i_mmap_nonlinear)) 788 goto out; 789 790 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, 791 shared.vm_set.list) { 792 if (vma->vm_flags & VM_LOCKED) 793 continue; 794 cursor = (unsigned long) vma->vm_private_data; 795 if (cursor > max_nl_cursor) 796 max_nl_cursor = cursor; 797 cursor = vma->vm_end - vma->vm_start; 798 if (cursor > max_nl_size) 799 max_nl_size = cursor; 800 } 801 802 if (max_nl_size == 0) { /* any nonlinears locked or reserved */ 803 ret = SWAP_FAIL; 804 goto out; 805 } 806 807 /* 808 * We don't try to search for this page in the nonlinear vmas, 809 * and page_referenced wouldn't have found it anyway. Instead 810 * just walk the nonlinear vmas trying to age and unmap some. 811 * The mapcount of the page we came in with is irrelevant, 812 * but even so use it as a guide to how hard we should try? 813 */ 814 mapcount = page_mapcount(page); 815 if (!mapcount) 816 goto out; 817 cond_resched_lock(&mapping->i_mmap_lock); 818 819 max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK; 820 if (max_nl_cursor == 0) 821 max_nl_cursor = CLUSTER_SIZE; 822 823 do { 824 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, 825 shared.vm_set.list) { 826 if (vma->vm_flags & VM_LOCKED) 827 continue; 828 cursor = (unsigned long) vma->vm_private_data; 829 while ( cursor < max_nl_cursor && 830 cursor < vma->vm_end - vma->vm_start) { 831 try_to_unmap_cluster(cursor, &mapcount, vma); 832 cursor += CLUSTER_SIZE; 833 vma->vm_private_data = (void *) cursor; 834 if ((int)mapcount <= 0) 835 goto out; 836 } 837 vma->vm_private_data = (void *) max_nl_cursor; 838 } 839 cond_resched_lock(&mapping->i_mmap_lock); 840 max_nl_cursor += CLUSTER_SIZE; 841 } while (max_nl_cursor <= max_nl_size); 842 843 /* 844 * Don't loop forever (perhaps all the remaining pages are 845 * in locked vmas). Reset cursor on all unreserved nonlinear 846 * vmas, now forgetting on which ones it had fallen behind. 847 */ 848 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) 849 vma->vm_private_data = NULL; 850out: 851 spin_unlock(&mapping->i_mmap_lock); 852 return ret; 853} 854 855/** 856 * try_to_unmap - try to remove all page table mappings to a page 857 * @page: the page to get unmapped 858 * 859 * Tries to remove all the page table entries which are mapping this 860 * page, used in the pageout path. Caller must hold the page lock. 861 * Return values are: 862 * 863 * SWAP_SUCCESS - we succeeded in removing all mappings 864 * SWAP_AGAIN - we missed a mapping, try again later 865 * SWAP_FAIL - the page is unswappable 866 */ 867int try_to_unmap(struct page *page, int ignore_refs) 868{ 869 int ret; 870 871 BUG_ON(!PageLocked(page)); 872 873 if (PageAnon(page)) 874 ret = try_to_unmap_anon(page, ignore_refs); 875 else 876 ret = try_to_unmap_file(page, ignore_refs); 877 878 if (!page_mapped(page)) 879 ret = SWAP_SUCCESS; 880 return ret; 881} 882 883