rmap.c revision a4b526b3ba6353cd89a38e41da48ed83b0ead16f
/*
 * mm/rmap.c - physical to virtual reverse mappings
 *
 * Copyright 2001, Rik van Riel <riel@conectiva.com.br>
 * Released under the General Public License (GPL).
 *
 * Simple, low overhead reverse mapping scheme.
 * Please try to keep this thing as modular as possible.
 *
 * Provides methods for unmapping each kind of mapped page:
 * the anon methods track anonymous pages, and
 * the file methods track pages belonging to an inode.
 *
 * Original design by Rik van Riel <riel@conectiva.com.br> 2001
 * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004
 * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004
 * Contributions by Hugh Dickins <hugh@veritas.com> 2003, 2004
 */

/*
 * Lock ordering in mm:
 *
 * inode->i_mutex (while writing or truncating, not reading or faulting)
 *   inode->i_alloc_sem (vmtruncate_range)
 *   mm->mmap_sem
 *     page->flags PG_locked (lock_page)
 *       mapping->i_mmap_lock
 *         anon_vma->lock
 *           mm->page_table_lock or pte_lock
 *             zone->lru_lock (in mark_page_accessed, isolate_lru_page)
 *             swap_lock (in swap_duplicate, swap_info_get)
 *               mmlist_lock (in mmput, drain_mmlist and others)
 *               mapping->private_lock (in __set_page_dirty_buffers)
 *               inode_lock (in set_page_dirty's __mark_inode_dirty)
 *                 sb_lock (within inode_lock in fs/fs-writeback.c)
 *                 mapping->tree_lock (widely used, in set_page_dirty,
 *                           in arch-dependent flush_dcache_mmap_lock,
 *                           within inode_lock in __sync_single_inode)
 */

#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/rmap.h>
#include <linux/rcupdate.h>
#include <linux/module.h>
#include <linux/kallsyms.h>
#include <linux/memcontrol.h>
#include <linux/mmu_notifier.h>

#include <asm/tlbflush.h>

struct kmem_cache *anon_vma_cachep;
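
/*
 * Note: anon_vma_cachep is created with SLAB_DESTROY_BY_RCU (see
 * anon_vma_init() below).  That is what makes the RCU-guarded
 * dereference in page_lock_anon_vma() safe: a concurrently freed
 * anon_vma may be reused for another anon_vma, but its memory cannot
 * go back to the page allocator while we are inside rcu_read_lock(),
 * so spin_lock(&anon_vma->lock) never touches freed memory.
 */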
/* This must be called under the mmap_sem. */
int anon_vma_prepare(struct vm_area_struct *vma)
{
        struct anon_vma *anon_vma = vma->anon_vma;

        might_sleep();
        if (unlikely(!anon_vma)) {
                struct mm_struct *mm = vma->vm_mm;
                struct anon_vma *allocated, *locked;

                anon_vma = find_mergeable_anon_vma(vma);
                if (anon_vma) {
                        allocated = NULL;
                        locked = anon_vma;
                        spin_lock(&locked->lock);
                } else {
                        anon_vma = anon_vma_alloc();
                        if (unlikely(!anon_vma))
                                return -ENOMEM;
                        allocated = anon_vma;
                        locked = NULL;
                }

                /* page_table_lock to protect against threads */
                spin_lock(&mm->page_table_lock);
                if (likely(!vma->anon_vma)) {
                        vma->anon_vma = anon_vma;
                        list_add_tail(&vma->anon_vma_node, &anon_vma->head);
                        allocated = NULL;
                }
                spin_unlock(&mm->page_table_lock);

                if (locked)
                        spin_unlock(&locked->lock);
                if (unlikely(allocated))
                        anon_vma_free(allocated);
        }
        return 0;
}

void __anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next)
{
        BUG_ON(vma->anon_vma != next->anon_vma);
        list_del(&next->anon_vma_node);
}

void __anon_vma_link(struct vm_area_struct *vma)
{
        struct anon_vma *anon_vma = vma->anon_vma;

        if (anon_vma)
                list_add_tail(&vma->anon_vma_node, &anon_vma->head);
}

void anon_vma_link(struct vm_area_struct *vma)
{
        struct anon_vma *anon_vma = vma->anon_vma;

        if (anon_vma) {
                spin_lock(&anon_vma->lock);
                list_add_tail(&vma->anon_vma_node, &anon_vma->head);
                spin_unlock(&anon_vma->lock);
        }
}

void anon_vma_unlink(struct vm_area_struct *vma)
{
        struct anon_vma *anon_vma = vma->anon_vma;
        int empty;

        if (!anon_vma)
                return;

        spin_lock(&anon_vma->lock);
        list_del(&vma->anon_vma_node);

        /* We must garbage collect the anon_vma if it's empty */
        empty = list_empty(&anon_vma->head);
        spin_unlock(&anon_vma->lock);

        if (empty)
                anon_vma_free(anon_vma);
}

static void anon_vma_ctor(void *data)
{
        struct anon_vma *anon_vma = data;

        spin_lock_init(&anon_vma->lock);
        INIT_LIST_HEAD(&anon_vma->head);
}

void __init anon_vma_init(void)
{
        anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
                        0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor);
}

/*
 * Getting a lock on a stable anon_vma from a page off the LRU is
 * tricky: page_lock_anon_vma relies on RCU to guard against the races.
 */
static struct anon_vma *page_lock_anon_vma(struct page *page)
{
        struct anon_vma *anon_vma;
        unsigned long anon_mapping;

        rcu_read_lock();
        anon_mapping = (unsigned long) page->mapping;
        if (!(anon_mapping & PAGE_MAPPING_ANON))
                goto out;
        if (!page_mapped(page))
                goto out;

        anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
        spin_lock(&anon_vma->lock);
        return anon_vma;
out:
        rcu_read_unlock();
        return NULL;
}

static void page_unlock_anon_vma(struct anon_vma *anon_vma)
{
        spin_unlock(&anon_vma->lock);
        rcu_read_unlock();
}
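
/*
 * Illustrative caller pattern (a sketch, not an exported API): every
 * user of page_lock_anon_vma() walks the vma list under the anon_vma
 * lock and then drops both the lock and the RCU read lock:
 *
 *      anon_vma = page_lock_anon_vma(page);
 *      if (anon_vma) {
 *              list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
 *                      ... examine one mapping of the page ...
 *              }
 *              page_unlock_anon_vma(anon_vma);
 *      }
 *
 * page_referenced_anon() and try_to_unmap_anon() below are the two
 * instances of this pattern in this file.
 */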
/*
 * At what user virtual address is page expected in @vma?
 * Returns virtual address or -EFAULT if page's index/offset is not
 * within the range mapped by @vma.
 */
static inline unsigned long
vma_address(struct page *page, struct vm_area_struct *vma)
{
        pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
        unsigned long address;

        address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
        if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
                /* page should be within @vma mapping range */
                return -EFAULT;
        }
        return address;
}

/*
 * At what user virtual address is page expected in vma? Checks that the
 * page matches the vma: currently only used on anon pages, by unuse_vma.
 */
unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
{
        if (PageAnon(page)) {
                if ((void *)vma->anon_vma !=
                    (void *)page->mapping - PAGE_MAPPING_ANON)
                        return -EFAULT;
        } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) {
                if (!vma->vm_file ||
                    vma->vm_file->f_mapping != page->mapping)
                        return -EFAULT;
        } else
                return -EFAULT;
        return vma_address(page, vma);
}

/*
 * Check that @page is mapped at @address into @mm.
 *
 * On success returns with pte mapped and locked.
 */
pte_t *page_check_address(struct page *page, struct mm_struct *mm,
                          unsigned long address, spinlock_t **ptlp)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;
        spinlock_t *ptl;

        pgd = pgd_offset(mm, address);
        if (!pgd_present(*pgd))
                return NULL;

        pud = pud_offset(pgd, address);
        if (!pud_present(*pud))
                return NULL;

        pmd = pmd_offset(pud, address);
        if (!pmd_present(*pmd))
                return NULL;

        pte = pte_offset_map(pmd, address);
        /* Make a quick check before getting the lock */
        if (!pte_present(*pte)) {
                pte_unmap(pte);
                return NULL;
        }

        ptl = pte_lockptr(mm, pmd);
        spin_lock(ptl);
        if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) {
                *ptlp = ptl;
                return pte;
        }
        pte_unmap_unlock(pte, ptl);
        return NULL;
}
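
/*
 * The per-pte workers below (page_referenced_one, page_mkclean_one,
 * try_to_unmap_one) all share one skeleton built on the two helpers
 * above.  In sketch form:
 *
 *      address = vma_address(page, vma);
 *      if (address == -EFAULT)
 *              goto out;
 *      pte = page_check_address(page, mm, address, &ptl);
 *      if (!pte)
 *              goto out;
 *      ... examine or modify the pte ...
 *      pte_unmap_unlock(pte, ptl);
 *
 * Worked example for vma_address(), with made-up numbers: with 4kB
 * pages, vm_start == 0x08048000, vm_pgoff == 0 and page->index == 3,
 * it returns 0x08048000 + (3 << 12) == 0x0804b000, or -EFAULT if that
 * falls outside [vm_start, vm_end).
 */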
/*
 * Subfunctions of page_referenced: page_referenced_one called
 * repeatedly from either page_referenced_anon or page_referenced_file.
 */
static int page_referenced_one(struct page *page,
        struct vm_area_struct *vma, unsigned int *mapcount)
{
        struct mm_struct *mm = vma->vm_mm;
        unsigned long address;
        pte_t *pte;
        spinlock_t *ptl;
        int referenced = 0;

        address = vma_address(page, vma);
        if (address == -EFAULT)
                goto out;

        pte = page_check_address(page, mm, address, &ptl);
        if (!pte)
                goto out;

        if (vma->vm_flags & VM_LOCKED) {
                referenced++;
                *mapcount = 1;  /* break early from loop */
        } else if (ptep_clear_flush_young_notify(vma, address, pte))
                referenced++;

        /*
         * Pretend the page is referenced if the task has the
         * swap token and is in the middle of a page fault.
         */
        if (mm != current->mm && has_swap_token(mm) &&
                        rwsem_is_locked(&mm->mmap_sem))
                referenced++;

        (*mapcount)--;
        pte_unmap_unlock(pte, ptl);
out:
        return referenced;
}

static int page_referenced_anon(struct page *page,
                                struct mem_cgroup *mem_cont)
{
        unsigned int mapcount;
        struct anon_vma *anon_vma;
        struct vm_area_struct *vma;
        int referenced = 0;

        anon_vma = page_lock_anon_vma(page);
        if (!anon_vma)
                return referenced;

        mapcount = page_mapcount(page);
        list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
                /*
                 * If we are reclaiming on behalf of a cgroup, skip
                 * counting on behalf of references from different
                 * cgroups.
                 */
                if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont))
                        continue;
                referenced += page_referenced_one(page, vma, &mapcount);
                if (!mapcount)
                        break;
        }

        page_unlock_anon_vma(anon_vma);
        return referenced;
}

/**
 * page_referenced_file - referenced check for object-based rmap
 * @page: the page we're checking references on.
 * @mem_cont: target memory controller
 *
 * For an object-based mapped page, find all the places it is mapped and
 * check/clear the referenced flag.  This is done by following the
 * page->mapping pointer, then walking the chain of vmas it holds.
 * It returns the number of references it found.
 *
 * This function is only called from page_referenced for object-based pages.
 */
static int page_referenced_file(struct page *page,
                                struct mem_cgroup *mem_cont)
{
        unsigned int mapcount;
        struct address_space *mapping = page->mapping;
        pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
        struct vm_area_struct *vma;
        struct prio_tree_iter iter;
        int referenced = 0;

        /*
         * The caller's checks on page->mapping and !PageAnon have made
         * sure that this is a file page: the check for page->mapping
         * excludes the case just before it gets set on an anon page.
         */
        BUG_ON(PageAnon(page));

        /*
         * The page lock not only makes sure that page->mapping cannot
         * suddenly be NULLified by truncation, it makes sure that the
         * structure at mapping cannot be freed and reused yet,
         * so we can safely take mapping->i_mmap_lock.
         */
        BUG_ON(!PageLocked(page));

        spin_lock(&mapping->i_mmap_lock);

        /*
         * i_mmap_lock does not stabilize mapcount at all, but mapcount
         * is more likely to be accurate if we note it after spinning.
         */
        mapcount = page_mapcount(page);

        vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
                /*
                 * If we are reclaiming on behalf of a cgroup, skip
                 * counting on behalf of references from different
                 * cgroups.
                 */
                if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont))
                        continue;
                if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE))
                                  == (VM_LOCKED|VM_MAYSHARE)) {
                        referenced++;
                        break;
                }
                referenced += page_referenced_one(page, vma, &mapcount);
                if (!mapcount)
                        break;
        }

        spin_unlock(&mapping->i_mmap_lock);
        return referenced;
}

/**
 * page_referenced - test if the page was referenced
 * @page: the page to test
 * @is_locked: caller holds lock on the page
 * @mem_cont: target memory controller
 *
 * Quick test_and_clear_referenced for all mappings to a page,
 * returns the number of ptes which referenced the page.
 */
int page_referenced(struct page *page, int is_locked,
                        struct mem_cgroup *mem_cont)
{
        int referenced = 0;

        if (TestClearPageReferenced(page))
                referenced++;

        if (page_mapped(page) && page->mapping) {
                if (PageAnon(page))
                        referenced += page_referenced_anon(page, mem_cont);
                else if (is_locked)
                        referenced += page_referenced_file(page, mem_cont);
                else if (TestSetPageLocked(page))
                        referenced++;
                else {
                        if (page->mapping)
                                referenced +=
                                        page_referenced_file(page, mem_cont);
                        unlock_page(page);
                }
        }

        if (page_test_and_clear_young(page))
                referenced++;

        return referenced;
}
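
/*
 * Sketch of the intended use (the real caller is the reclaim scanner
 * in mm/vmscan.c; this fragment is only illustrative):
 *
 *      referenced = page_referenced(page, 1, NULL);
 *      if (referenced)
 *              ... recently used: keep or reactivate the page ...
 *      else
 *              ... not referenced since the last scan: eviction
 *                  candidate ...
 *
 * Note that the test also *clears* the accessed state (pte young bits,
 * PG_referenced, and whatever page_test_and_clear_young() checks at
 * the architecture level), so each call reports the references
 * accumulated since the previous one.
 */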
static int page_mkclean_one(struct page *page, struct vm_area_struct *vma)
{
        struct mm_struct *mm = vma->vm_mm;
        unsigned long address;
        pte_t *pte;
        spinlock_t *ptl;
        int ret = 0;

        address = vma_address(page, vma);
        if (address == -EFAULT)
                goto out;

        pte = page_check_address(page, mm, address, &ptl);
        if (!pte)
                goto out;

        if (pte_dirty(*pte) || pte_write(*pte)) {
                pte_t entry;

                flush_cache_page(vma, address, pte_pfn(*pte));
                entry = ptep_clear_flush_notify(vma, address, pte);
                entry = pte_wrprotect(entry);
                entry = pte_mkclean(entry);
                set_pte_at(mm, address, pte, entry);
                ret = 1;
        }

        pte_unmap_unlock(pte, ptl);
out:
        return ret;
}

static int page_mkclean_file(struct address_space *mapping, struct page *page)
{
        pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
        struct vm_area_struct *vma;
        struct prio_tree_iter iter;
        int ret = 0;

        BUG_ON(PageAnon(page));

        spin_lock(&mapping->i_mmap_lock);
        vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
                if (vma->vm_flags & VM_SHARED)
                        ret += page_mkclean_one(page, vma);
        }
        spin_unlock(&mapping->i_mmap_lock);
        return ret;
}

int page_mkclean(struct page *page)
{
        int ret = 0;

        BUG_ON(!PageLocked(page));

        if (page_mapped(page)) {
                struct address_space *mapping = page_mapping(page);
                if (mapping) {
                        ret = page_mkclean_file(mapping, page);
                        if (page_test_dirty(page)) {
                                page_clear_dirty(page);
                                ret = 1;
                        }
                }
        }

        return ret;
}
EXPORT_SYMBOL_GPL(page_mkclean);
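
/*
 * What page_mkclean() buys the caller (a sketch; the in-tree user is
 * the writeback path, which wants stable dirty accounting for shared
 * file mappings): after
 *
 *      lock_page(page);
 *      cleaned = page_mkclean(page);
 *
 * every pte mapping the page in a VM_SHARED vma has been
 * write-protected and marked clean, so the next store from userspace
 * must fault and go through the dirty-tracking machinery again;
 * "cleaned" is nonzero if any pte (or the architecture's per-page
 * dirty state) was in fact dirty or writable.
 */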
/**
 * __page_set_anon_rmap - setup new anonymous rmap
 * @page: the page to add the mapping to
 * @vma: the vm area in which the mapping is added
 * @address: the user virtual address mapped
 */
static void __page_set_anon_rmap(struct page *page,
        struct vm_area_struct *vma, unsigned long address)
{
        struct anon_vma *anon_vma = vma->anon_vma;

        BUG_ON(!anon_vma);
        anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
        page->mapping = (struct address_space *) anon_vma;

        page->index = linear_page_index(vma, address);

        /*
         * nr_mapped state can be updated without turning off
         * interrupts because it is not modified via interrupt.
         */
        __inc_zone_page_state(page, NR_ANON_PAGES);
}

/**
 * __page_check_anon_rmap - sanity check anonymous rmap addition
 * @page: the page to add the mapping to
 * @vma: the vm area in which the mapping is added
 * @address: the user virtual address mapped
 */
static void __page_check_anon_rmap(struct page *page,
        struct vm_area_struct *vma, unsigned long address)
{
#ifdef CONFIG_DEBUG_VM
        /*
         * The page's anon-rmap details (mapping and index) are guaranteed to
         * be set up correctly at this point.
         *
         * We have exclusion against page_add_anon_rmap because the caller
         * always holds the page locked, except if called from page_dup_rmap,
         * in which case the page is already known to be setup.
         *
         * We have exclusion against page_add_new_anon_rmap because those pages
         * are initially only visible via the pagetables, and the pte is locked
         * over the call to page_add_new_anon_rmap.
         */
        struct anon_vma *anon_vma = vma->anon_vma;
        anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
        BUG_ON(page->mapping != (struct address_space *)anon_vma);
        BUG_ON(page->index != linear_page_index(vma, address));
#endif
}

/**
 * page_add_anon_rmap - add pte mapping to an anonymous page
 * @page: the page to add the mapping to
 * @vma: the vm area in which the mapping is added
 * @address: the user virtual address mapped
 *
 * The caller needs to hold the pte lock and the page must be locked.
 */
void page_add_anon_rmap(struct page *page,
        struct vm_area_struct *vma, unsigned long address)
{
        VM_BUG_ON(!PageLocked(page));
        VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
        if (atomic_inc_and_test(&page->_mapcount))
                __page_set_anon_rmap(page, vma, address);
        else
                __page_check_anon_rmap(page, vma, address);
}

/**
 * page_add_new_anon_rmap - add pte mapping to a new anonymous page
 * @page: the page to add the mapping to
 * @vma: the vm area in which the mapping is added
 * @address: the user virtual address mapped
 *
 * Same as page_add_anon_rmap but must only be called on *new* pages.
 * This means the inc-and-test can be bypassed.
 * Page does not have to be locked.
 */
void page_add_new_anon_rmap(struct page *page,
        struct vm_area_struct *vma, unsigned long address)
{
        BUG_ON(address < vma->vm_start || address >= vma->vm_end);
        atomic_set(&page->_mapcount, 0); /* elevate count by 1 (starts at -1) */
        __page_set_anon_rmap(page, vma, address);
}

/**
 * page_add_file_rmap - add pte mapping to a file page
 * @page: the page to add the mapping to
 *
 * The caller needs to hold the pte lock.
 */
void page_add_file_rmap(struct page *page)
{
        if (atomic_inc_and_test(&page->_mapcount))
                __inc_zone_page_state(page, NR_FILE_MAPPED);
}
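
/*
 * Two encodings worth spelling out (an illustrative walk-through, not
 * new behaviour):
 *
 * - page->_mapcount starts life at -1, meaning "no ptes map this
 *   page".  atomic_inc_and_test() therefore returns true exactly when
 *   the first mapping is added (-1 -> 0), which is when the anon rmap
 *   is set up or NR_FILE_MAPPED is bumped; page_mapcount(page) is
 *   _mapcount + 1, the number of ptes mapping the page.
 *
 * - For anonymous pages, page->mapping holds the anon_vma pointer
 *   with the low PAGE_MAPPING_ANON bit set; that is the bit which
 *   page_lock_anon_vma() above tests and subtracts off.
 */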
#ifdef CONFIG_DEBUG_VM
/**
 * page_dup_rmap - duplicate pte mapping to a page
 * @page: the page to add the mapping to
 * @vma: the vm area being duplicated
 * @address: the user virtual address mapped
 *
 * For copy_page_range only: minimal extract from page_add_file_rmap /
 * page_add_anon_rmap, avoiding unnecessary tests (already checked) so it's
 * quicker.
 *
 * The caller needs to hold the pte lock.
 */
void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address)
{
        BUG_ON(page_mapcount(page) == 0);
        if (PageAnon(page))
                __page_check_anon_rmap(page, vma, address);
        atomic_inc(&page->_mapcount);
}
#endif

/**
 * page_remove_rmap - take down pte mapping from a page
 * @page: page to remove mapping from
 * @vma: the vm area in which the mapping is removed
 *
 * The caller needs to hold the pte lock.
 */
void page_remove_rmap(struct page *page, struct vm_area_struct *vma)
{
        if (atomic_add_negative(-1, &page->_mapcount)) {
                if (unlikely(page_mapcount(page) < 0)) {
                        printk(KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page));
                        printk(KERN_EMERG "  page pfn = %lx\n", page_to_pfn(page));
                        printk(KERN_EMERG "  page->flags = %lx\n", page->flags);
                        printk(KERN_EMERG "  page->count = %x\n", page_count(page));
                        printk(KERN_EMERG "  page->mapping = %p\n", page->mapping);
                        print_symbol(KERN_EMERG "  vma->vm_ops = %s\n", (unsigned long)vma->vm_ops);
                        if (vma->vm_ops) {
                                print_symbol(KERN_EMERG "  vma->vm_ops->fault = %s\n", (unsigned long)vma->vm_ops->fault);
                        }
                        if (vma->vm_file && vma->vm_file->f_op)
                                print_symbol(KERN_EMERG "  vma->vm_file->f_op->mmap = %s\n", (unsigned long)vma->vm_file->f_op->mmap);
                        BUG();
                }

                /*
                 * It would be tidy to reset the PageAnon mapping here,
                 * but that might overwrite a racing page_add_anon_rmap
                 * which increments mapcount after us but sets mapping
                 * before us: so leave the reset to free_hot_cold_page,
                 * and remember that it's only reliable while mapped.
                 * Leaving it set also helps swapoff to reinstate ptes
                 * faster for those pages still in swapcache.
                 */
                if ((!PageAnon(page) || PageSwapCache(page)) &&
                    page_test_dirty(page)) {
                        page_clear_dirty(page);
                        set_page_dirty(page);
                }
                mem_cgroup_uncharge_page(page);

                __dec_zone_page_state(page,
                        PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED);
        }
}
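
/*
 * The canonical teardown sequence, for reference (this is what
 * try_to_unmap_one() and try_to_unmap_cluster() below do; a sketch
 * only):
 *
 *      flush_cache_page(vma, address, pte_pfn(*pte));
 *      pteval = ptep_clear_flush_notify(vma, address, pte);
 *      if (pte_dirty(pteval))
 *              set_page_dirty(page);
 *      page_remove_rmap(page, vma);
 *      page_cache_release(page);
 *
 * i.e. the pte is gone and TLBs are flushed before the rmap and the
 * page-cache reference are dropped, so page_remove_rmap() never races
 * with a hardware access through the pte it accounts for.
 */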
/*
 * Subfunctions of try_to_unmap: try_to_unmap_one called
 * repeatedly from either try_to_unmap_anon or try_to_unmap_file.
 */
static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
                                int migration)
{
        struct mm_struct *mm = vma->vm_mm;
        unsigned long address;
        pte_t *pte;
        pte_t pteval;
        spinlock_t *ptl;
        int ret = SWAP_AGAIN;

        address = vma_address(page, vma);
        if (address == -EFAULT)
                goto out;

        pte = page_check_address(page, mm, address, &ptl);
        if (!pte)
                goto out;

        /*
         * If the page is mlock()d, we cannot swap it out.
         * If it's recently referenced (perhaps page_referenced
         * skipped over this mm) then we should reactivate it.
         */
        if (!migration && ((vma->vm_flags & VM_LOCKED) ||
                        (ptep_clear_flush_young_notify(vma, address, pte)))) {
                ret = SWAP_FAIL;
                goto out_unmap;
        }

        /* Nuke the page table entry. */
        flush_cache_page(vma, address, page_to_pfn(page));
        pteval = ptep_clear_flush_notify(vma, address, pte);

        /* Move the dirty bit to the physical page now the pte is gone. */
        if (pte_dirty(pteval))
                set_page_dirty(page);

        /* Update high watermark before we lower rss */
        update_hiwater_rss(mm);

        if (PageAnon(page)) {
                swp_entry_t entry = { .val = page_private(page) };

                if (PageSwapCache(page)) {
                        /*
                         * Store the swap location in the pte.
                         * See handle_pte_fault() ...
                         */
                        swap_duplicate(entry);
                        if (list_empty(&mm->mmlist)) {
                                spin_lock(&mmlist_lock);
                                if (list_empty(&mm->mmlist))
                                        list_add(&mm->mmlist, &init_mm.mmlist);
                                spin_unlock(&mmlist_lock);
                        }
                        dec_mm_counter(mm, anon_rss);
#ifdef CONFIG_MIGRATION
                } else {
                        /*
                         * Store the pfn of the page in a special migration
                         * pte. do_swap_page() will wait until the migration
                         * pte is removed and then restart fault handling.
                         */
                        BUG_ON(!migration);
                        entry = make_migration_entry(page, pte_write(pteval));
#endif
                }
                set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
                BUG_ON(pte_file(*pte));
        } else
#ifdef CONFIG_MIGRATION
        if (migration) {
                /* Establish migration entry for a file page */
                swp_entry_t entry;
                entry = make_migration_entry(page, pte_write(pteval));
                set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
        } else
#endif
                dec_mm_counter(mm, file_rss);

        page_remove_rmap(page, vma);
        page_cache_release(page);

out_unmap:
        pte_unmap_unlock(pte, ptl);
out:
        return ret;
}

/*
 * objrmap doesn't work for nonlinear VMAs because the assumption that
 * offset-into-file correlates with offset-into-virtual-addresses does not hold.
 * Consequently, given a particular page and its ->index, we cannot locate the
 * ptes which are mapping that page without an exhaustive linear search.
 *
 * So what this code does is a mini "virtual scan" of each nonlinear VMA which
 * maps the file to which the target page belongs.  The ->vm_private_data field
 * holds the current cursor into that scan.  Successive searches will circulate
 * around the vma's virtual address space.
 *
 * So as more replacement pressure is applied to the pages in a nonlinear VMA,
 * more scanning pressure is placed against them as well.  Eventually pages
 * will become fully unmapped and are eligible for eviction.
 *
 * For very sparsely populated VMAs this is a little inefficient - chances are
 * there won't be many ptes located within the scan cluster.  In this case
 * maybe we could scan further - to the end of the pte page, perhaps.
 */
#define CLUSTER_SIZE    min(32*PAGE_SIZE, PMD_SIZE)
#define CLUSTER_MASK    (~(CLUSTER_SIZE - 1))
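
/*
 * Cluster arithmetic, with example numbers: on a 4kB-page
 * configuration CLUSTER_SIZE is min(128kB, PMD_SIZE) = 128kB, so
 * CLUSTER_MASK is ~0x1ffff.  A cursor of 0x23456 into a vma starting
 * at 0xb7400000 gives
 *
 *      address = (0xb7400000 + 0x23456) & ~0x1ffff = 0xb7420000
 *      end     = 0xb7420000 + 0x20000              = 0xb7440000
 *
 * i.e. one aligned 128kB window, subsequently clamped to
 * [vm_start, vm_end).
 */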
static void try_to_unmap_cluster(unsigned long cursor,
        unsigned int *mapcount, struct vm_area_struct *vma)
{
        struct mm_struct *mm = vma->vm_mm;
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;
        pte_t pteval;
        spinlock_t *ptl;
        struct page *page;
        unsigned long address;
        unsigned long end;

        address = (vma->vm_start + cursor) & CLUSTER_MASK;
        end = address + CLUSTER_SIZE;
        if (address < vma->vm_start)
                address = vma->vm_start;
        if (end > vma->vm_end)
                end = vma->vm_end;

        pgd = pgd_offset(mm, address);
        if (!pgd_present(*pgd))
                return;

        pud = pud_offset(pgd, address);
        if (!pud_present(*pud))
                return;

        pmd = pmd_offset(pud, address);
        if (!pmd_present(*pmd))
                return;

        pte = pte_offset_map_lock(mm, pmd, address, &ptl);

        /* Update high watermark before we lower rss */
        update_hiwater_rss(mm);

        for (; address < end; pte++, address += PAGE_SIZE) {
                if (!pte_present(*pte))
                        continue;
                page = vm_normal_page(vma, address, *pte);
                BUG_ON(!page || PageAnon(page));

                if (ptep_clear_flush_young_notify(vma, address, pte))
                        continue;

                /* Nuke the page table entry. */
                flush_cache_page(vma, address, pte_pfn(*pte));
                pteval = ptep_clear_flush_notify(vma, address, pte);

                /* If nonlinear, store the file page offset in the pte. */
                if (page->index != linear_page_index(vma, address))
                        set_pte_at(mm, address, pte, pgoff_to_pte(page->index));

                /* Move the dirty bit to the physical page now the pte is gone. */
                if (pte_dirty(pteval))
                        set_page_dirty(page);

                page_remove_rmap(page, vma);
                page_cache_release(page);
                dec_mm_counter(mm, file_rss);
                (*mapcount)--;
        }
        pte_unmap_unlock(pte - 1, ptl);
}

static int try_to_unmap_anon(struct page *page, int migration)
{
        struct anon_vma *anon_vma;
        struct vm_area_struct *vma;
        int ret = SWAP_AGAIN;

        anon_vma = page_lock_anon_vma(page);
        if (!anon_vma)
                return ret;

        list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
                ret = try_to_unmap_one(page, vma, migration);
                if (ret == SWAP_FAIL || !page_mapped(page))
                        break;
        }

        page_unlock_anon_vma(anon_vma);
        return ret;
}
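
/*
 * Note on the pgoff_to_pte() case above (explanatory, no new
 * behaviour): in a nonlinear vma a page may be installed at an address
 * that does not correspond to its file offset.  Rather than lose that
 * placement, the cluster scan leaves behind a "file pte" encoding
 * page->index, so that a later fault on this address (the pte_file()
 * case in the fault path) can look up the same file page again,
 * instead of the one linear_page_index() would suggest.
 */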
/**
 * try_to_unmap_file - unmap file page using the object-based rmap method
 * @page: the page to unmap
 * @migration: migration flag
 *
 * Find all the mappings of a page using the mapping pointer and the vma chains
 * contained in the address_space struct it points to.
 *
 * This function is only called from try_to_unmap for object-based pages.
 */
static int try_to_unmap_file(struct page *page, int migration)
{
        struct address_space *mapping = page->mapping;
        pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
        struct vm_area_struct *vma;
        struct prio_tree_iter iter;
        int ret = SWAP_AGAIN;
        unsigned long cursor;
        unsigned long max_nl_cursor = 0;
        unsigned long max_nl_size = 0;
        unsigned int mapcount;

        spin_lock(&mapping->i_mmap_lock);
        vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
                ret = try_to_unmap_one(page, vma, migration);
                if (ret == SWAP_FAIL || !page_mapped(page))
                        goto out;
        }

        if (list_empty(&mapping->i_mmap_nonlinear))
                goto out;

        list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
                                                shared.vm_set.list) {
                if ((vma->vm_flags & VM_LOCKED) && !migration)
                        continue;
                cursor = (unsigned long) vma->vm_private_data;
                if (cursor > max_nl_cursor)
                        max_nl_cursor = cursor;
                cursor = vma->vm_end - vma->vm_start;
                if (cursor > max_nl_size)
                        max_nl_size = cursor;
        }

        if (max_nl_size == 0) { /* all nonlinears were locked or reserved */
                ret = SWAP_FAIL;
                goto out;
        }

        /*
         * We don't try to search for this page in the nonlinear vmas,
         * and page_referenced wouldn't have found it anyway.  Instead
         * just walk the nonlinear vmas trying to age and unmap some.
         * The mapcount of the page we came in with is irrelevant,
         * but even so use it as a guide to how hard we should try?
         */
        mapcount = page_mapcount(page);
        if (!mapcount)
                goto out;
        cond_resched_lock(&mapping->i_mmap_lock);

        max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK;
        if (max_nl_cursor == 0)
                max_nl_cursor = CLUSTER_SIZE;

        do {
                list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
                                                shared.vm_set.list) {
                        if ((vma->vm_flags & VM_LOCKED) && !migration)
                                continue;
                        cursor = (unsigned long) vma->vm_private_data;
                        while (cursor < max_nl_cursor &&
                                cursor < vma->vm_end - vma->vm_start) {
                                try_to_unmap_cluster(cursor, &mapcount, vma);
                                cursor += CLUSTER_SIZE;
                                vma->vm_private_data = (void *) cursor;
                                if ((int)mapcount <= 0)
                                        goto out;
                        }
                        vma->vm_private_data = (void *) max_nl_cursor;
                }
                cond_resched_lock(&mapping->i_mmap_lock);
                max_nl_cursor += CLUSTER_SIZE;
        } while (max_nl_cursor <= max_nl_size);

        /*
         * Don't loop forever (perhaps all the remaining pages are
         * in locked vmas).  Reset cursor on all unreserved nonlinear
         * vmas, now forgetting on which ones it had fallen behind.
         */
        list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
                vma->vm_private_data = NULL;
out:
        spin_unlock(&mapping->i_mmap_lock);
        return ret;
}
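
/*
 * Cursor progression, with example numbers: take a single 512kB
 * nonlinear vma whose cursor (vm_private_data) starts at 0, on a
 * 4kB-page box where CLUSTER_SIZE is 128kB.  The do-while above scans
 * clusters at 0, 128kB, 256kB and 384kB, raising max_nl_cursor by
 * CLUSTER_SIZE per pass, until either the whole vma has been covered
 * or enough ptes have gone (*mapcount reaching zero).  On early exit
 * the cursor keeps its position, so the next call resumes further
 * along the vma; only a scan that runs to completion resets all
 * cursors back to NULL.
 */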
/**
 * try_to_unmap - try to remove all page table mappings to a page
 * @page: the page to get unmapped
 * @migration: migration flag
 *
 * Tries to remove all the page table entries which are mapping this
 * page, used in the pageout path.  Caller must hold the page lock.
 * Return values are:
 *
 * SWAP_SUCCESS - we succeeded in removing all mappings
 * SWAP_AGAIN   - we missed a mapping, try again later
 * SWAP_FAIL    - the page is unswappable
 */
int try_to_unmap(struct page *page, int migration)
{
        int ret;

        BUG_ON(!PageLocked(page));

        if (PageAnon(page))
                ret = try_to_unmap_anon(page, migration);
        else
                ret = try_to_unmap_file(page, migration);

        if (!page_mapped(page))
                ret = SWAP_SUCCESS;
        return ret;
}
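
/*
 * Typical pageout-path usage (a sketch; the actual caller is the
 * reclaim code in mm/vmscan.c):
 *
 *      lock_page(page);
 *      switch (try_to_unmap(page, 0)) {
 *      case SWAP_SUCCESS:
 *              ... all ptes are gone; the page can be written back
 *                  and/or freed ...
 *              break;
 *      case SWAP_AGAIN:
 *              ... a mapping was missed; keep the page and retry on
 *                  a later pass ...
 *              break;
 *      case SWAP_FAIL:
 *              ... mlocked or recently referenced; reactivate ...
 *              break;
 *      }
 *      unlock_page(page);
 *
 * Page migration passes migration == 1 instead, which replaces each
 * pte with a migration entry rather than failing on locked or
 * recently referenced pages.
 */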