rmap.c revision 0fe6e20b9c4c53b3e97096ee73a0857f60aad43f
/*
 * mm/rmap.c - physical to virtual reverse mappings
 *
 * Copyright 2001, Rik van Riel <riel@conectiva.com.br>
 * Released under the General Public License (GPL).
 *
 * Simple, low overhead reverse mapping scheme.
 * Please try to keep this thing as modular as possible.
 *
 * Provides methods for unmapping each kind of mapped page:
 * the anon methods track anonymous pages, and
 * the file methods track pages belonging to an inode.
 *
 * Original design by Rik van Riel <riel@conectiva.com.br> 2001
 * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004
 * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004
 * Contributions by Hugh Dickins 2003, 2004
 */

/*
 * Lock ordering in mm:
 *
 * inode->i_mutex	(while writing or truncating, not reading or faulting)
 *   inode->i_alloc_sem (vmtruncate_range)
 *   mm->mmap_sem
 *     page->flags PG_locked (lock_page)
 *       mapping->i_mmap_lock
 *         anon_vma->lock
 *           mm->page_table_lock or pte_lock
 *             zone->lru_lock (in mark_page_accessed, isolate_lru_page)
 *             swap_lock (in swap_duplicate, swap_info_get)
 *               mmlist_lock (in mmput, drain_mmlist and others)
 *               mapping->private_lock (in __set_page_dirty_buffers)
 *               inode_lock (in set_page_dirty's __mark_inode_dirty)
 *                 sb_lock (within inode_lock in fs/fs-writeback.c)
 *                 mapping->tree_lock (widely used, in set_page_dirty,
 *                           in arch-dependent flush_dcache_mmap_lock,
 *                           within inode_lock in __sync_single_inode)
 *
 * (code doesn't rely on that order so it could be switched around)
 * ->tasklist_lock
 *   anon_vma->lock      (memory_failure, collect_procs_anon)
 *     pte map lock
 */

#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/rcupdate.h>
#include <linux/module.h>
#include <linux/memcontrol.h>
#include <linux/mmu_notifier.h>
#include <linux/migrate.h>
#include <linux/hugetlb.h>

#include <asm/tlbflush.h>

#include "internal.h"

static struct kmem_cache *anon_vma_cachep;
static struct kmem_cache *anon_vma_chain_cachep;

static inline struct anon_vma *anon_vma_alloc(void)
{
	return kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
}

void anon_vma_free(struct anon_vma *anon_vma)
{
	kmem_cache_free(anon_vma_cachep, anon_vma);
}

static inline struct anon_vma_chain *anon_vma_chain_alloc(void)
{
	return kmem_cache_alloc(anon_vma_chain_cachep, GFP_KERNEL);
}

void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
{
	kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain);
}

/**
 * anon_vma_prepare - attach an anon_vma to a memory region
 * @vma: the memory region in question
 *
 * This makes sure the memory mapping described by 'vma' has
 * an 'anon_vma' attached to it, so that we can associate the
 * anonymous pages mapped into it with that anon_vma.
 *
 * The common case will be that we already have one, but if
 * not we either need to find an adjacent mapping that we
 * can re-use the anon_vma from (very common when the only
 * reason for splitting a vma has been mprotect()), or we
 * allocate a new one.
 *
 * Anon-vma allocations are very subtle, because we may have
 * optimistically looked up an anon_vma in page_lock_anon_vma()
 * and that may actually touch the spinlock even in the newly
 * allocated vma (it depends on RCU to make sure that the
 * anon_vma isn't actually destroyed).
 *
 * As a result, we need to do proper anon_vma locking even
 * for the new allocation. At the same time, we do not want
 * to do any locking for the common case of already having
 * an anon_vma.
 *
 * This must be called with the mmap_sem held for reading.
 */
int anon_vma_prepare(struct vm_area_struct *vma)
{
	struct anon_vma *anon_vma = vma->anon_vma;
	struct anon_vma_chain *avc;

	might_sleep();
	if (unlikely(!anon_vma)) {
		struct mm_struct *mm = vma->vm_mm;
		struct anon_vma *allocated;

		avc = anon_vma_chain_alloc();
		if (!avc)
			goto out_enomem;

		anon_vma = find_mergeable_anon_vma(vma);
		allocated = NULL;
		if (!anon_vma) {
			anon_vma = anon_vma_alloc();
			if (unlikely(!anon_vma))
				goto out_enomem_free_avc;
			allocated = anon_vma;
		}

		spin_lock(&anon_vma->lock);
		/* page_table_lock to protect against threads */
		spin_lock(&mm->page_table_lock);
		if (likely(!vma->anon_vma)) {
			vma->anon_vma = anon_vma;
			avc->anon_vma = anon_vma;
			avc->vma = vma;
			list_add(&avc->same_vma, &vma->anon_vma_chain);
			list_add(&avc->same_anon_vma, &anon_vma->head);
			allocated = NULL;
			avc = NULL;
		}
		spin_unlock(&mm->page_table_lock);
		spin_unlock(&anon_vma->lock);

		if (unlikely(allocated))
			anon_vma_free(allocated);
		if (unlikely(avc))
			anon_vma_chain_free(avc);
	}
	return 0;

 out_enomem_free_avc:
	anon_vma_chain_free(avc);
 out_enomem:
	return -ENOMEM;
}

static void anon_vma_chain_link(struct vm_area_struct *vma,
				struct anon_vma_chain *avc,
				struct anon_vma *anon_vma)
{
	avc->vma = vma;
	avc->anon_vma = anon_vma;
	list_add(&avc->same_vma, &vma->anon_vma_chain);

	spin_lock(&anon_vma->lock);
	list_add_tail(&avc->same_anon_vma, &anon_vma->head);
	spin_unlock(&anon_vma->lock);
}

/*
 * Attach the anon_vmas from src to dst.
 * Returns 0 on success, -ENOMEM on failure.
 */
int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
{
	struct anon_vma_chain *avc, *pavc;

	list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
		avc = anon_vma_chain_alloc();
		if (!avc)
			goto enomem_failure;
		anon_vma_chain_link(dst, avc, pavc->anon_vma);
	}
	return 0;

 enomem_failure:
	unlink_anon_vmas(dst);
	return -ENOMEM;
}
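
/*
 * A minimal usage sketch (hypothetical caller, not part of this file):
 * code that duplicates an existing vma is expected to copy its anon_vma
 * chains with anon_vma_clone(), and to unwind (anon_vma_clone() already
 * calls unlink_anon_vmas() on its own -ENOMEM path) before freeing the
 * half-built vma:
 *
 *	struct vm_area_struct *new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
 *
 *	if (!new_vma)
 *		return -ENOMEM;
 *	*new_vma = *old_vma;
 *	INIT_LIST_HEAD(&new_vma->anon_vma_chain);
 *	if (anon_vma_clone(new_vma, old_vma)) {
 *		kmem_cache_free(vm_area_cachep, new_vma);
 *		return -ENOMEM;
 *	}
 */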

/*
 * Attach vma to its own anon_vma, as well as to the anon_vmas that
 * the corresponding VMA in the parent process is attached to.
 * Returns 0 on success, non-zero on failure.
 */
int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
{
	struct anon_vma_chain *avc;
	struct anon_vma *anon_vma;

	/* Don't bother if the parent process has no anon_vma here. */
	if (!pvma->anon_vma)
		return 0;

	/*
	 * First, attach the new VMA to the parent VMA's anon_vmas,
	 * so rmap can find non-COWed pages in child processes.
	 */
	if (anon_vma_clone(vma, pvma))
		return -ENOMEM;

	/* Then add our own anon_vma. */
	anon_vma = anon_vma_alloc();
	if (!anon_vma)
		goto out_error;
	avc = anon_vma_chain_alloc();
	if (!avc)
		goto out_error_free_anon_vma;
	anon_vma_chain_link(vma, avc, anon_vma);
	/* Mark this anon_vma as the one where our new (COWed) pages go. */
	vma->anon_vma = anon_vma;

	return 0;

 out_error_free_anon_vma:
	anon_vma_free(anon_vma);
 out_error:
	unlink_anon_vmas(vma);
	return -ENOMEM;
}

static void anon_vma_unlink(struct anon_vma_chain *anon_vma_chain)
{
	struct anon_vma *anon_vma = anon_vma_chain->anon_vma;
	int empty;

	/* If anon_vma_fork fails, we can get an empty anon_vma_chain. */
	if (!anon_vma)
		return;

	spin_lock(&anon_vma->lock);
	list_del(&anon_vma_chain->same_anon_vma);

	/* We must garbage collect the anon_vma if it's empty */
	empty = list_empty(&anon_vma->head) && !anonvma_external_refcount(anon_vma);
	spin_unlock(&anon_vma->lock);

	if (empty)
		anon_vma_free(anon_vma);
}

void unlink_anon_vmas(struct vm_area_struct *vma)
{
	struct anon_vma_chain *avc, *next;

	/* Unlink each anon_vma chained to the VMA. */
	list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
		anon_vma_unlink(avc);
		list_del(&avc->same_vma);
		anon_vma_chain_free(avc);
	}
}

static void anon_vma_ctor(void *data)
{
	struct anon_vma *anon_vma = data;

	spin_lock_init(&anon_vma->lock);
	anonvma_external_refcount_init(anon_vma);
	INIT_LIST_HEAD(&anon_vma->head);
}

void __init anon_vma_init(void)
{
	anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
			0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor);
	anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain, SLAB_PANIC);
}

/*
 * Getting a lock on a stable anon_vma from a page off the LRU is
 * tricky: page_lock_anon_vma relies on RCU to guard against the races.
 */
struct anon_vma *page_lock_anon_vma(struct page *page)
{
	struct anon_vma *anon_vma;
	unsigned long anon_mapping;

	rcu_read_lock();
	anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping);
	if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
		goto out;
	if (!page_mapped(page))
		goto out;

	anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
	spin_lock(&anon_vma->lock);
	return anon_vma;
out:
	rcu_read_unlock();
	return NULL;
}

void page_unlock_anon_vma(struct anon_vma *anon_vma)
{
	spin_unlock(&anon_vma->lock);
	rcu_read_unlock();
}
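
/*
 * A minimal sketch of the canonical pattern for visiting every vma that
 * might map an anonymous page (hypothetical walker, mirroring what the
 * page_referenced_anon/try_to_unmap_anon helpers below do):
 *
 *	struct anon_vma *anon_vma;
 *	struct anon_vma_chain *avc;
 *
 *	anon_vma = page_lock_anon_vma(page);
 *	if (!anon_vma)
 *		return;			// not anon, or no longer mapped
 *	list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
 *		struct vm_area_struct *vma = avc->vma;
 *		// ... act on one candidate mapping of page in vma ...
 *	}
 *	page_unlock_anon_vma(anon_vma);
 */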

/*
 * At what user virtual address is page expected in @vma?
 * Returns virtual address or -EFAULT if page's index/offset is not
 * within the range mapped by the @vma.
 */
static inline unsigned long
vma_address(struct page *page, struct vm_area_struct *vma)
{
	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
	unsigned long address;

	if (unlikely(is_vm_hugetlb_page(vma)))
		pgoff = page->index << huge_page_order(page_hstate(page));
	address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
	if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
		/* page should be within @vma mapping range */
		return -EFAULT;
	}
	return address;
}

/*
 * At what user virtual address is page expected in vma?
 * Caller should check the page is actually part of the vma.
 */
unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
{
	if (PageAnon(page))
		;
	else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) {
		if (!vma->vm_file ||
		    vma->vm_file->f_mapping != page->mapping)
			return -EFAULT;
	} else
		return -EFAULT;
	return vma_address(page, vma);
}

/*
 * Check that @page is mapped at @address into @mm.
 *
 * If @sync is false, page_check_address may perform a racy check to avoid
 * the page table lock when the pte is not present (helpful when reclaiming
 * highly shared pages).
 *
 * On success returns with pte mapped and locked.
 */
pte_t *page_check_address(struct page *page, struct mm_struct *mm,
			  unsigned long address, spinlock_t **ptlp, int sync)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;
	spinlock_t *ptl;

	if (unlikely(PageHuge(page))) {
		pte = huge_pte_offset(mm, address);
		ptl = &mm->page_table_lock;
		goto check;
	}

	pgd = pgd_offset(mm, address);
	if (!pgd_present(*pgd))
		return NULL;

	pud = pud_offset(pgd, address);
	if (!pud_present(*pud))
		return NULL;

	pmd = pmd_offset(pud, address);
	if (!pmd_present(*pmd))
		return NULL;

	pte = pte_offset_map(pmd, address);
	/* Make a quick check before getting the lock */
	if (!sync && !pte_present(*pte)) {
		pte_unmap(pte);
		return NULL;
	}

	ptl = pte_lockptr(mm, pmd);
check:
	spin_lock(ptl);
	if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) {
		*ptlp = ptl;
		return pte;
	}
	pte_unmap_unlock(pte, ptl);
	return NULL;
}

/**
 * page_mapped_in_vma - check whether a page is really mapped in a VMA
 * @page: the page to test
 * @vma: the VMA to test
 *
 * Returns 1 if the page is mapped into the page tables of the VMA, 0
 * if the page is not mapped into the page tables of this VMA.  Only
 * valid for normal file or anonymous VMAs.
 */
int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
{
	unsigned long address;
	pte_t *pte;
	spinlock_t *ptl;

	address = vma_address(page, vma);
	if (address == -EFAULT)		/* out of vma range */
		return 0;
	pte = page_check_address(page, vma->vm_mm, address, &ptl, 1);
	if (!pte)			/* the page is not in this mm */
		return 0;
	pte_unmap_unlock(pte, ptl);

	return 1;
}

/*
 * Subfunctions of page_referenced: page_referenced_one called
 * repeatedly from either page_referenced_anon or page_referenced_file.
 */
int page_referenced_one(struct page *page, struct vm_area_struct *vma,
			unsigned long address, unsigned int *mapcount,
			unsigned long *vm_flags)
{
	struct mm_struct *mm = vma->vm_mm;
	pte_t *pte;
	spinlock_t *ptl;
	int referenced = 0;

	pte = page_check_address(page, mm, address, &ptl, 0);
	if (!pte)
		goto out;

	/*
	 * Don't want to elevate referenced for mlocked page that gets this far,
	 * in order that it progresses to try_to_unmap and is moved to the
	 * unevictable list.
	 */
	if (vma->vm_flags & VM_LOCKED) {
		*mapcount = 1;	/* break early from loop */
		*vm_flags |= VM_LOCKED;
		goto out_unmap;
	}

	if (ptep_clear_flush_young_notify(vma, address, pte)) {
		/*
		 * Don't treat a reference through a sequentially read
		 * mapping as such.  If the page has been used in
		 * another mapping, we will catch it; if this other
		 * mapping is already gone, the unmap path will have
		 * set PG_referenced or activated the page.
		 */
		if (likely(!VM_SequentialReadHint(vma)))
			referenced++;
	}

	/* Pretend the page is referenced if the task has the
	   swap token and is in the middle of a page fault. */
	if (mm != current->mm && has_swap_token(mm) &&
			rwsem_is_locked(&mm->mmap_sem))
		referenced++;

out_unmap:
	(*mapcount)--;
	pte_unmap_unlock(pte, ptl);

	if (referenced)
		*vm_flags |= vma->vm_flags;
out:
	return referenced;
}

static int page_referenced_anon(struct page *page,
				struct mem_cgroup *mem_cont,
				unsigned long *vm_flags)
{
	unsigned int mapcount;
	struct anon_vma *anon_vma;
	struct anon_vma_chain *avc;
	int referenced = 0;

	anon_vma = page_lock_anon_vma(page);
	if (!anon_vma)
		return referenced;

	mapcount = page_mapcount(page);
	list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
		struct vm_area_struct *vma = avc->vma;
		unsigned long address = vma_address(page, vma);
		if (address == -EFAULT)
			continue;
		/*
		 * If we are reclaiming on behalf of a cgroup, skip
		 * counting on behalf of references from different
		 * cgroups
		 */
		if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont))
			continue;
		referenced += page_referenced_one(page, vma, address,
						  &mapcount, vm_flags);
		if (!mapcount)
			break;
	}

	page_unlock_anon_vma(anon_vma);
	return referenced;
}

/**
 * page_referenced_file - referenced check for object-based rmap
 * @page: the page we're checking references on.
 * @mem_cont: target memory controller
 * @vm_flags: collect encountered vma->vm_flags that actually referenced the page
 *
 * For an object-based mapped page, find all the places it is mapped and
 * check/clear the referenced flag.  This is done by following the page->mapping
 * pointer, then walking the chain of vmas it holds.  It returns the number
 * of references it found.
 *
 * This function is only called from page_referenced for object-based pages.
 */
static int page_referenced_file(struct page *page,
				struct mem_cgroup *mem_cont,
				unsigned long *vm_flags)
{
	unsigned int mapcount;
	struct address_space *mapping = page->mapping;
	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
	struct vm_area_struct *vma;
	struct prio_tree_iter iter;
	int referenced = 0;

	/*
	 * The caller's checks on page->mapping and !PageAnon have made
	 * sure that this is a file page: the check for page->mapping
	 * excludes the case just before it gets set on an anon page.
	 */
	BUG_ON(PageAnon(page));

	/*
	 * The page lock not only makes sure that page->mapping cannot
	 * suddenly be NULLified by truncation, it makes sure that the
	 * structure at mapping cannot be freed and reused yet,
	 * so we can safely take mapping->i_mmap_lock.
	 */
	BUG_ON(!PageLocked(page));

	spin_lock(&mapping->i_mmap_lock);

	/*
	 * i_mmap_lock does not stabilize mapcount at all, but mapcount
	 * is more likely to be accurate if we note it after spinning.
	 */
	mapcount = page_mapcount(page);

	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
		unsigned long address = vma_address(page, vma);
		if (address == -EFAULT)
			continue;
		/*
		 * If we are reclaiming on behalf of a cgroup, skip
		 * counting on behalf of references from different
		 * cgroups
		 */
		if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont))
			continue;
		referenced += page_referenced_one(page, vma, address,
						  &mapcount, vm_flags);
		if (!mapcount)
			break;
	}

	spin_unlock(&mapping->i_mmap_lock);
	return referenced;
}

/**
 * page_referenced - test if the page was referenced
 * @page: the page to test
 * @is_locked: caller holds lock on the page
 * @mem_cont: target memory controller
 * @vm_flags: collect encountered vma->vm_flags that actually referenced the page
 *
 * Quick test_and_clear_referenced for all mappings to a page,
 * returns the number of ptes which referenced the page.
 */
int page_referenced(struct page *page,
		    int is_locked,
		    struct mem_cgroup *mem_cont,
		    unsigned long *vm_flags)
{
	int referenced = 0;
	int we_locked = 0;

	*vm_flags = 0;
	if (page_mapped(page) && page_rmapping(page)) {
		if (!is_locked && (!PageAnon(page) || PageKsm(page))) {
			we_locked = trylock_page(page);
			if (!we_locked) {
				referenced++;
				goto out;
			}
		}
		if (unlikely(PageKsm(page)))
			referenced += page_referenced_ksm(page, mem_cont,
								vm_flags);
		else if (PageAnon(page))
			referenced += page_referenced_anon(page, mem_cont,
								vm_flags);
		else if (page->mapping)
			referenced += page_referenced_file(page, mem_cont,
								vm_flags);
		if (we_locked)
			unlock_page(page);
	}
out:
	if (page_test_and_clear_young(page))
		referenced++;

	return referenced;
}
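
/*
 * A minimal sketch of how a reclaim-style caller might consume
 * page_referenced() (hypothetical, condensed from the vmscan pattern;
 * a locked page and global reclaim, so mem_cont is NULL, are assumed):
 *
 *	unsigned long vm_flags;
 *	int referenced;
 *
 *	referenced = page_referenced(page, 1, NULL, &vm_flags);
 *	if (vm_flags & VM_LOCKED)
 *		// page sits in an mlock()ed vma: don't try to reclaim it
 *	else if (referenced)
 *		// recently used: keep it (or re-activate it)
 *	else
 *		// no references found: candidate for eviction
 */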

static int page_mkclean_one(struct page *page, struct vm_area_struct *vma,
			    unsigned long address)
{
	struct mm_struct *mm = vma->vm_mm;
	pte_t *pte;
	spinlock_t *ptl;
	int ret = 0;

	pte = page_check_address(page, mm, address, &ptl, 1);
	if (!pte)
		goto out;

	if (pte_dirty(*pte) || pte_write(*pte)) {
		pte_t entry;

		flush_cache_page(vma, address, pte_pfn(*pte));
		entry = ptep_clear_flush_notify(vma, address, pte);
		entry = pte_wrprotect(entry);
		entry = pte_mkclean(entry);
		set_pte_at(mm, address, pte, entry);
		ret = 1;
	}

	pte_unmap_unlock(pte, ptl);
out:
	return ret;
}

static int page_mkclean_file(struct address_space *mapping, struct page *page)
{
	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
	struct vm_area_struct *vma;
	struct prio_tree_iter iter;
	int ret = 0;

	BUG_ON(PageAnon(page));

	spin_lock(&mapping->i_mmap_lock);
	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
		if (vma->vm_flags & VM_SHARED) {
			unsigned long address = vma_address(page, vma);
			if (address == -EFAULT)
				continue;
			ret += page_mkclean_one(page, vma, address);
		}
	}
	spin_unlock(&mapping->i_mmap_lock);
	return ret;
}

int page_mkclean(struct page *page)
{
	int ret = 0;

	BUG_ON(!PageLocked(page));

	if (page_mapped(page)) {
		struct address_space *mapping = page_mapping(page);
		if (mapping) {
			ret = page_mkclean_file(mapping, page);
			if (page_test_dirty(page)) {
				page_clear_dirty(page);
				ret = 1;
			}
		}
	}

	return ret;
}
EXPORT_SYMBOL_GPL(page_mkclean);
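
/*
 * A minimal usage sketch (hypothetical caller): page_mkclean() is meant
 * for writeback-style code that is about to write a page out and wants
 * every writable, dirty pte pointing at it write-protected and cleaned
 * first, so that later stores refault and redirty the page:
 *
 *	lock_page(page);
 *	if (page_mkclean(page))
 *		set_page_dirty(page);	// pte dirt moved to the struct page
 *	// ... start writeback; further stores now go through the
 *	// write-protect fault path ...
 *	unlock_page(page);
 */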

/**
 * page_move_anon_rmap - move a page to our anon_vma
 * @page: the page to move to our anon_vma
 * @vma: the vma the page belongs to
 * @address: the user virtual address mapped
 *
 * When a page belongs exclusively to one process after a COW event,
 * that page can be moved into the anon_vma that belongs to just that
 * process, so the rmap code will not search the parent or sibling
 * processes.
 */
void page_move_anon_rmap(struct page *page,
	struct vm_area_struct *vma, unsigned long address)
{
	struct anon_vma *anon_vma = vma->anon_vma;

	VM_BUG_ON(!PageLocked(page));
	VM_BUG_ON(!anon_vma);
	VM_BUG_ON(page->index != linear_page_index(vma, address));

	anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
	page->mapping = (struct address_space *) anon_vma;
}

/**
 * __page_set_anon_rmap - setup new anonymous rmap
 * @page: the page to add the mapping to
 * @vma: the vm area in which the mapping is added
 * @address: the user virtual address mapped
 * @exclusive: the page is exclusively owned by the current process
 */
static void __page_set_anon_rmap(struct page *page,
	struct vm_area_struct *vma, unsigned long address, int exclusive)
{
	struct anon_vma *anon_vma = vma->anon_vma;

	BUG_ON(!anon_vma);

	/*
	 * If the page isn't exclusively mapped into this vma,
	 * we must use the _oldest_ possible anon_vma for the
	 * page mapping!
	 *
	 * So take the last AVC chain entry in the vma, which is
	 * the deepest ancestor, and use the anon_vma from that.
	 */
	if (!exclusive) {
		struct anon_vma_chain *avc;
		avc = list_entry(vma->anon_vma_chain.prev, struct anon_vma_chain, same_vma);
		anon_vma = avc->anon_vma;
	}

	anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
	page->mapping = (struct address_space *) anon_vma;
	page->index = linear_page_index(vma, address);
}

/**
 * __page_check_anon_rmap - sanity check anonymous rmap addition
 * @page: the page to add the mapping to
 * @vma: the vm area in which the mapping is added
 * @address: the user virtual address mapped
 */
static void __page_check_anon_rmap(struct page *page,
	struct vm_area_struct *vma, unsigned long address)
{
#ifdef CONFIG_DEBUG_VM
	/*
	 * The page's anon-rmap details (mapping and index) are guaranteed to
	 * be set up correctly at this point.
	 *
	 * We have exclusion against page_add_anon_rmap because the caller
	 * always holds the page locked, except if called from page_dup_rmap,
	 * in which case the page is already known to be setup.
	 *
	 * We have exclusion against page_add_new_anon_rmap because those pages
	 * are initially only visible via the pagetables, and the pte is locked
	 * over the call to page_add_new_anon_rmap.
	 */
	BUG_ON(page->index != linear_page_index(vma, address));
#endif
}

/**
 * page_add_anon_rmap - add pte mapping to an anonymous page
 * @page: the page to add the mapping to
 * @vma: the vm area in which the mapping is added
 * @address: the user virtual address mapped
 *
 * The caller needs to hold the pte lock, and the page must be locked in
 * the anon_vma case: to serialize mapping,index checking after setting,
 * and to ensure that PageAnon is not being upgraded racily to PageKsm
 * (but PageKsm is never downgraded to PageAnon).
 */
void page_add_anon_rmap(struct page *page,
	struct vm_area_struct *vma, unsigned long address)
{
	int first = atomic_inc_and_test(&page->_mapcount);
	if (first)
		__inc_zone_page_state(page, NR_ANON_PAGES);
	if (unlikely(PageKsm(page)))
		return;

	VM_BUG_ON(!PageLocked(page));
	VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
	if (first)
		__page_set_anon_rmap(page, vma, address, 0);
	else
		__page_check_anon_rmap(page, vma, address);
}

/**
 * page_add_new_anon_rmap - add pte mapping to a new anonymous page
 * @page: the page to add the mapping to
 * @vma: the vm area in which the mapping is added
 * @address: the user virtual address mapped
 *
 * Same as page_add_anon_rmap but must only be called on *new* pages.
 * This means the inc-and-test can be bypassed.
 * Page does not have to be locked.
 */
void page_add_new_anon_rmap(struct page *page,
	struct vm_area_struct *vma, unsigned long address)
{
	VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
	SetPageSwapBacked(page);
	atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */
	__inc_zone_page_state(page, NR_ANON_PAGES);
	__page_set_anon_rmap(page, vma, address, 1);
	if (page_evictable(page, vma))
		lru_cache_add_lru(page, LRU_ACTIVE_ANON);
	else
		add_page_to_unevictable_list(page);
}

/**
 * page_add_file_rmap - add pte mapping to a file page
 * @page: the page to add the mapping to
 *
 * The caller needs to hold the pte lock.
 */
void page_add_file_rmap(struct page *page)
{
	if (atomic_inc_and_test(&page->_mapcount)) {
		__inc_zone_page_state(page, NR_FILE_MAPPED);
		mem_cgroup_update_file_mapped(page, 1);
	}
}

/**
 * page_remove_rmap - take down pte mapping from a page
 * @page: page to remove mapping from
 *
 * The caller needs to hold the pte lock.
 */
void page_remove_rmap(struct page *page)
{
	/* page still mapped by someone else? */
	if (!atomic_add_negative(-1, &page->_mapcount))
		return;

	/*
	 * Now that the last pte has gone, s390 must transfer dirty
	 * flag from storage key to struct page.  We can usually skip
	 * this if the page is anon, so about to be freed; but perhaps
	 * not if it's in swapcache - there might be another pte slot
	 * containing the swap entry, but page not yet written to swap.
	 */
	if ((!PageAnon(page) || PageSwapCache(page)) && page_test_dirty(page)) {
		page_clear_dirty(page);
		set_page_dirty(page);
	}
	/*
	 * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED
	 * and not charged by memcg for now.
	 */
	if (unlikely(PageHuge(page)))
		return;
	if (PageAnon(page)) {
		mem_cgroup_uncharge_page(page);
		__dec_zone_page_state(page, NR_ANON_PAGES);
	} else {
		__dec_zone_page_state(page, NR_FILE_MAPPED);
		mem_cgroup_update_file_mapped(page, -1);
	}
	/*
	 * It would be tidy to reset the PageAnon mapping here,
	 * but that might overwrite a racing page_add_anon_rmap
	 * which increments mapcount after us but sets mapping
	 * before us: so leave the reset to free_hot_cold_page,
	 * and remember that it's only reliable while mapped.
	 * Leaving it set also helps swapoff to reinstate ptes
	 * faster for those pages still in swapcache.
	 */
}
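
/*
 * A minimal sketch of the accounting contract (hypothetical fault and
 * zap paths, both assumed to run under the pte lock): every pte made to
 * point at a page is announced with one of the page_add_*_rmap() calls
 * above, and every pte torn down is matched by page_remove_rmap():
 *
 *	// establishing a pte for a freshly allocated anonymous page
 *	page_add_new_anon_rmap(page, vma, address);
 *	set_pte_at(mm, address, pte, mk_pte(page, vma->vm_page_prot));
 *
 *	// later, tearing that pte down again
 *	pteval = ptep_clear_flush(vma, address, pte);
 *	if (pte_dirty(pteval))
 *		set_page_dirty(page);
 *	page_remove_rmap(page);
 *	page_cache_release(page);	// drop the reference the pte held
 */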

/*
 * Subfunctions of try_to_unmap: try_to_unmap_one called
 * repeatedly from either try_to_unmap_anon or try_to_unmap_file.
 */
int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
		     unsigned long address, enum ttu_flags flags)
{
	struct mm_struct *mm = vma->vm_mm;
	pte_t *pte;
	pte_t pteval;
	spinlock_t *ptl;
	int ret = SWAP_AGAIN;

	pte = page_check_address(page, mm, address, &ptl, 0);
	if (!pte)
		goto out;

	/*
	 * If the page is mlock()d, we cannot swap it out.
	 * If it's recently referenced (perhaps page_referenced
	 * skipped over this mm) then we should reactivate it.
	 */
	if (!(flags & TTU_IGNORE_MLOCK)) {
		if (vma->vm_flags & VM_LOCKED)
			goto out_mlock;

		if (TTU_ACTION(flags) == TTU_MUNLOCK)
			goto out_unmap;
	}
	if (!(flags & TTU_IGNORE_ACCESS)) {
		if (ptep_clear_flush_young_notify(vma, address, pte)) {
			ret = SWAP_FAIL;
			goto out_unmap;
		}
	}

	/* Nuke the page table entry. */
	flush_cache_page(vma, address, page_to_pfn(page));
	pteval = ptep_clear_flush_notify(vma, address, pte);

	/* Move the dirty bit to the physical page now the pte is gone. */
	if (pte_dirty(pteval))
		set_page_dirty(page);

	/* Update high watermark before we lower rss */
	update_hiwater_rss(mm);

	if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
		if (PageAnon(page))
			dec_mm_counter(mm, MM_ANONPAGES);
		else
			dec_mm_counter(mm, MM_FILEPAGES);
		set_pte_at(mm, address, pte,
			   swp_entry_to_pte(make_hwpoison_entry(page)));
	} else if (PageAnon(page)) {
		swp_entry_t entry = { .val = page_private(page) };

		if (PageSwapCache(page)) {
			/*
			 * Store the swap location in the pte.
			 * See handle_pte_fault() ...
			 */
			if (swap_duplicate(entry) < 0) {
				set_pte_at(mm, address, pte, pteval);
				ret = SWAP_FAIL;
				goto out_unmap;
			}
			if (list_empty(&mm->mmlist)) {
				spin_lock(&mmlist_lock);
				if (list_empty(&mm->mmlist))
					list_add(&mm->mmlist, &init_mm.mmlist);
				spin_unlock(&mmlist_lock);
			}
			dec_mm_counter(mm, MM_ANONPAGES);
			inc_mm_counter(mm, MM_SWAPENTS);
		} else if (PAGE_MIGRATION) {
			/*
			 * Store the pfn of the page in a special migration
			 * pte. do_swap_page() will wait until the migration
			 * pte is removed and then restart fault handling.
			 */
			BUG_ON(TTU_ACTION(flags) != TTU_MIGRATION);
			entry = make_migration_entry(page, pte_write(pteval));
		}
		set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
		BUG_ON(pte_file(*pte));
	} else if (PAGE_MIGRATION && (TTU_ACTION(flags) == TTU_MIGRATION)) {
		/* Establish migration entry for a file page */
		swp_entry_t entry;
		entry = make_migration_entry(page, pte_write(pteval));
		set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
	} else
		dec_mm_counter(mm, MM_FILEPAGES);

	page_remove_rmap(page);
	page_cache_release(page);

out_unmap:
	pte_unmap_unlock(pte, ptl);
out:
	return ret;

out_mlock:
	pte_unmap_unlock(pte, ptl);

	/*
	 * We need mmap_sem locking: otherwise the VM_LOCKED check gives an
	 * unstable, racy result.  We can't wait for the semaphore here,
	 * because we now hold anon_vma->lock or mapping->i_mmap_lock.
	 * If the trylock fails, the page remains on the evictable lru and
	 * vmscan can later retry moving it to the unevictable lru if the
	 * page is actually mlocked.
	 */
	if (down_read_trylock(&vma->vm_mm->mmap_sem)) {
		if (vma->vm_flags & VM_LOCKED) {
			mlock_vma_page(page);
			ret = SWAP_MLOCK;
		}
		up_read(&vma->vm_mm->mmap_sem);
	}
	return ret;
}

/*
 * objrmap doesn't work for nonlinear VMAs because the assumption that
 * offset-into-file correlates with offset-into-virtual-addresses does not hold.
 * Consequently, given a particular page and its ->index, we cannot locate the
 * ptes which are mapping that page without an exhaustive linear search.
 *
 * So what this code does is a mini "virtual scan" of each nonlinear VMA which
 * maps the file to which the target page belongs.  The ->vm_private_data field
 * holds the current cursor into that scan.  Successive searches will circulate
 * around the vma's virtual address space.
 *
 * So as more replacement pressure is applied to the pages in a nonlinear VMA,
 * more scanning pressure is placed against them as well.  Eventually pages
 * will become fully unmapped and are eligible for eviction.
 *
 * For very sparsely populated VMAs this is a little inefficient - chances are
 * there won't be many ptes located within the scan cluster.  In this case
 * maybe we could scan further - to the end of the pte page, perhaps.
 *
 * Mlocked pages:  check VM_LOCKED under mmap_sem held for read, if we can
 * acquire it without blocking.  If vma locked, mlock the pages in the cluster,
 * rather than unmapping them.  If we encounter the "check_page" that vmscan is
 * trying to unmap, return SWAP_MLOCK, else default SWAP_AGAIN.
 */
#define CLUSTER_SIZE	min(32*PAGE_SIZE, PMD_SIZE)
#define CLUSTER_MASK	(~(CLUSTER_SIZE - 1))
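
/*
 * Worked example of the cluster arithmetic, assuming 4KB pages (so
 * CLUSTER_SIZE is 32 * 4KB = 128KB and CLUSTER_MASK is ~0x1ffff):
 * with vm_start = 0x40000000 and a saved cursor of 0x2a000,
 * try_to_unmap_cluster() below scans
 *
 *	address = (0x40000000 + 0x2a000) & ~0x1ffff = 0x40020000
 *	end     = 0x40020000 + 0x20000              = 0x40040000
 *
 * i.e. one aligned 128KB window of 32 ptes per call, after which the
 * caller advances the cursor by CLUSTER_SIZE.
 */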

static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
		struct vm_area_struct *vma, struct page *check_page)
{
	struct mm_struct *mm = vma->vm_mm;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;
	pte_t pteval;
	spinlock_t *ptl;
	struct page *page;
	unsigned long address;
	unsigned long end;
	int ret = SWAP_AGAIN;
	int locked_vma = 0;

	address = (vma->vm_start + cursor) & CLUSTER_MASK;
	end = address + CLUSTER_SIZE;
	if (address < vma->vm_start)
		address = vma->vm_start;
	if (end > vma->vm_end)
		end = vma->vm_end;

	pgd = pgd_offset(mm, address);
	if (!pgd_present(*pgd))
		return ret;

	pud = pud_offset(pgd, address);
	if (!pud_present(*pud))
		return ret;

	pmd = pmd_offset(pud, address);
	if (!pmd_present(*pmd))
		return ret;

	/*
	 * If we can acquire the mmap_sem for read, and vma is VM_LOCKED,
	 * keep the sem while scanning the cluster for mlocking pages.
	 */
	if (down_read_trylock(&vma->vm_mm->mmap_sem)) {
		locked_vma = (vma->vm_flags & VM_LOCKED);
		if (!locked_vma)
			up_read(&vma->vm_mm->mmap_sem); /* don't need it */
	}

	pte = pte_offset_map_lock(mm, pmd, address, &ptl);

	/* Update high watermark before we lower rss */
	update_hiwater_rss(mm);

	for (; address < end; pte++, address += PAGE_SIZE) {
		if (!pte_present(*pte))
			continue;
		page = vm_normal_page(vma, address, *pte);
		BUG_ON(!page || PageAnon(page));

		if (locked_vma) {
			mlock_vma_page(page);	/* no-op if already mlocked */
			if (page == check_page)
				ret = SWAP_MLOCK;
			continue;	/* don't unmap */
		}

		if (ptep_clear_flush_young_notify(vma, address, pte))
			continue;

		/* Nuke the page table entry. */
		flush_cache_page(vma, address, pte_pfn(*pte));
		pteval = ptep_clear_flush_notify(vma, address, pte);

		/* If nonlinear, store the file page offset in the pte. */
		if (page->index != linear_page_index(vma, address))
			set_pte_at(mm, address, pte, pgoff_to_pte(page->index));

		/* Move the dirty bit to the physical page now the pte is gone. */
		if (pte_dirty(pteval))
			set_page_dirty(page);

		page_remove_rmap(page);
		page_cache_release(page);
		dec_mm_counter(mm, MM_FILEPAGES);
		(*mapcount)--;
	}
	pte_unmap_unlock(pte - 1, ptl);
	if (locked_vma)
		up_read(&vma->vm_mm->mmap_sem);
	return ret;
}

static bool is_vma_temporary_stack(struct vm_area_struct *vma)
{
	int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP);

	if (!maybe_stack)
		return false;

	if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) ==
						VM_STACK_INCOMPLETE_SETUP)
		return true;

	return false;
}

/**
 * try_to_unmap_anon - unmap or unlock anonymous page using the object-based
 * rmap method
 * @page: the page to unmap/unlock
 * @flags: action and flags
 *
 * Find all the mappings of a page using the mapping pointer and the vma chains
 * contained in the anon_vma struct it points to.
 *
 * This function is only called from try_to_unmap/try_to_munlock for
 * anonymous pages.
 * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
 * where the page was found will be held for write.  So, we won't recheck
 * vm_flags for that VMA.  That should be OK, because that vma shouldn't be
 * VM_LOCKED.
 */
static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
{
	struct anon_vma *anon_vma;
	struct anon_vma_chain *avc;
	int ret = SWAP_AGAIN;

	anon_vma = page_lock_anon_vma(page);
	if (!anon_vma)
		return ret;

	list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
		struct vm_area_struct *vma = avc->vma;
		unsigned long address;

		/*
		 * During exec, a temporary VMA is setup and later moved.
		 * The VMA is moved under the anon_vma lock but not the
		 * page tables leading to a race where migration cannot
		 * find the migration ptes. Rather than increasing the
		 * locking requirements of exec(), migration skips
		 * temporary VMAs until after exec() completes.
		 */
		if (PAGE_MIGRATION && (flags & TTU_MIGRATION) &&
				is_vma_temporary_stack(vma))
			continue;

		address = vma_address(page, vma);
		if (address == -EFAULT)
			continue;
		ret = try_to_unmap_one(page, vma, address, flags);
		if (ret != SWAP_AGAIN || !page_mapped(page))
			break;
	}

	page_unlock_anon_vma(anon_vma);
	return ret;
}

/**
 * try_to_unmap_file - unmap/unlock file page using the object-based rmap method
 * @page: the page to unmap/unlock
 * @flags: action and flags
 *
 * Find all the mappings of a page using the mapping pointer and the vma chains
 * contained in the address_space struct it points to.
 *
 * This function is only called from try_to_unmap/try_to_munlock for
 * object-based pages.
 * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
 * where the page was found will be held for write.  So, we won't recheck
 * vm_flags for that VMA.  That should be OK, because that vma shouldn't be
 * VM_LOCKED.
 */
static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
{
	struct address_space *mapping = page->mapping;
	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
	struct vm_area_struct *vma;
	struct prio_tree_iter iter;
	int ret = SWAP_AGAIN;
	unsigned long cursor;
	unsigned long max_nl_cursor = 0;
	unsigned long max_nl_size = 0;
	unsigned int mapcount;

	spin_lock(&mapping->i_mmap_lock);
	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
		unsigned long address = vma_address(page, vma);
		if (address == -EFAULT)
			continue;
		ret = try_to_unmap_one(page, vma, address, flags);
		if (ret != SWAP_AGAIN || !page_mapped(page))
			goto out;
	}

	if (list_empty(&mapping->i_mmap_nonlinear))
		goto out;

	/*
	 * We don't bother to try to find the munlocked page in nonlinears.
	 * It's costly. Instead, later, page reclaim logic may call
	 * try_to_unmap(TTU_MUNLOCK) and recover PG_mlocked lazily.
	 */
	if (TTU_ACTION(flags) == TTU_MUNLOCK)
		goto out;

	list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
						shared.vm_set.list) {
		cursor = (unsigned long) vma->vm_private_data;
		if (cursor > max_nl_cursor)
			max_nl_cursor = cursor;
		cursor = vma->vm_end - vma->vm_start;
		if (cursor > max_nl_size)
			max_nl_size = cursor;
	}

	if (max_nl_size == 0) {	/* all nonlinears locked or reserved ? */
		ret = SWAP_FAIL;
		goto out;
	}

	/*
	 * We don't try to search for this page in the nonlinear vmas,
	 * and page_referenced wouldn't have found it anyway.  Instead
	 * just walk the nonlinear vmas trying to age and unmap some.
	 * The mapcount of the page we came in with is irrelevant,
	 * but even so use it as a guide to how hard we should try?
	 */
	mapcount = page_mapcount(page);
	if (!mapcount)
		goto out;
	cond_resched_lock(&mapping->i_mmap_lock);

	max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK;
	if (max_nl_cursor == 0)
		max_nl_cursor = CLUSTER_SIZE;

	do {
		list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
						shared.vm_set.list) {
			cursor = (unsigned long) vma->vm_private_data;
			while (cursor < max_nl_cursor &&
				cursor < vma->vm_end - vma->vm_start) {
				if (try_to_unmap_cluster(cursor, &mapcount,
						vma, page) == SWAP_MLOCK)
					ret = SWAP_MLOCK;
				cursor += CLUSTER_SIZE;
				vma->vm_private_data = (void *) cursor;
				if ((int)mapcount <= 0)
					goto out;
			}
			vma->vm_private_data = (void *) max_nl_cursor;
		}
		cond_resched_lock(&mapping->i_mmap_lock);
		max_nl_cursor += CLUSTER_SIZE;
	} while (max_nl_cursor <= max_nl_size);

	/*
	 * Don't loop forever (perhaps all the remaining pages are
	 * in locked vmas).  Reset cursor on all unreserved nonlinear
	 * vmas, now forgetting on which ones it had fallen behind.
	 */
	list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
		vma->vm_private_data = NULL;
out:
	spin_unlock(&mapping->i_mmap_lock);
	return ret;
}

/**
 * try_to_unmap - try to remove all page table mappings to a page
 * @page: the page to get unmapped
 * @flags: action and flags
 *
 * Tries to remove all the page table entries which are mapping this
 * page, used in the pageout path.  Caller must hold the page lock.
 * Return values are:
 *
 * SWAP_SUCCESS	- we succeeded in removing all mappings
 * SWAP_AGAIN	- we missed a mapping, try again later
 * SWAP_FAIL	- the page is unswappable
 * SWAP_MLOCK	- page is mlocked.
 */
int try_to_unmap(struct page *page, enum ttu_flags flags)
{
	int ret;

	BUG_ON(!PageLocked(page));

	if (unlikely(PageKsm(page)))
		ret = try_to_unmap_ksm(page, flags);
	else if (PageAnon(page))
		ret = try_to_unmap_anon(page, flags);
	else
		ret = try_to_unmap_file(page, flags);
	if (ret != SWAP_MLOCK && !page_mapped(page))
		ret = SWAP_SUCCESS;
	return ret;
}

/**
 * try_to_munlock - try to munlock a page
 * @page: the page to be munlocked
 *
 * Called from munlock code.  Checks all of the VMAs mapping the page
 * to make sure nobody else has this page mlocked. The page will be
 * returned with PG_mlocked cleared if no other vmas have it mlocked.
 *
 * Return values are:
 *
 * SWAP_AGAIN	- no vma is holding page mlocked, or,
 * SWAP_AGAIN	- page mapped in mlocked vma -- couldn't acquire mmap sem
 * SWAP_FAIL	- page cannot be located at present
 * SWAP_MLOCK	- page is now mlocked.
 */
int try_to_munlock(struct page *page)
{
	VM_BUG_ON(!PageLocked(page) || PageLRU(page));

	if (unlikely(PageKsm(page)))
		return try_to_unmap_ksm(page, TTU_MUNLOCK);
	else if (PageAnon(page))
		return try_to_unmap_anon(page, TTU_MUNLOCK);
	else
		return try_to_unmap_file(page, TTU_MUNLOCK);
}
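
/*
 * A minimal sketch of a pageout-style caller of try_to_unmap()
 * (hypothetical, condensed from the shrink_page_list pattern):
 *
 *	lock_page(page);
 *	switch (try_to_unmap(page, TTU_UNMAP)) {
 *	case SWAP_SUCCESS:
 *		// no ptes left; safe to write back and/or free the page
 *		break;
 *	case SWAP_AGAIN:
 *		// a mapping was missed; keep the page and retry later
 *		break;
 *	case SWAP_FAIL:
 *		// unswappable or recently referenced; re-activate it
 *		break;
 *	case SWAP_MLOCK:
 *		// page is mlocked; move it to the unevictable list
 *		break;
 *	}
 *	unlock_page(page);
 */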

#ifdef CONFIG_MIGRATION
/*
 * rmap_walk() and its helpers rmap_walk_anon() and rmap_walk_file():
 * Called by migrate.c to remove migration ptes, but might be used more later.
 */
static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
		struct vm_area_struct *, unsigned long, void *), void *arg)
{
	struct anon_vma *anon_vma;
	struct anon_vma_chain *avc;
	int ret = SWAP_AGAIN;

	/*
	 * Note: remove_migration_ptes() cannot use page_lock_anon_vma()
	 * because that depends on page_mapped(); but not all its usages
	 * are holding mmap_sem. Users without mmap_sem are required to
	 * take a reference count to prevent the anon_vma disappearing
	 */
	anon_vma = page_anon_vma(page);
	if (!anon_vma)
		return ret;
	spin_lock(&anon_vma->lock);
	list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
		struct vm_area_struct *vma = avc->vma;
		unsigned long address = vma_address(page, vma);
		if (address == -EFAULT)
			continue;
		ret = rmap_one(page, vma, address, arg);
		if (ret != SWAP_AGAIN)
			break;
	}
	spin_unlock(&anon_vma->lock);
	return ret;
}

static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *,
		struct vm_area_struct *, unsigned long, void *), void *arg)
{
	struct address_space *mapping = page->mapping;
	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
	struct vm_area_struct *vma;
	struct prio_tree_iter iter;
	int ret = SWAP_AGAIN;

	if (!mapping)
		return ret;
	spin_lock(&mapping->i_mmap_lock);
	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
		unsigned long address = vma_address(page, vma);
		if (address == -EFAULT)
			continue;
		ret = rmap_one(page, vma, address, arg);
		if (ret != SWAP_AGAIN)
			break;
	}
	/*
	 * No nonlinear handling: being always shared, nonlinear vmas
	 * never contain migration ptes.  Decide what to do about this
	 * limitation to linear when we need rmap_walk() on nonlinear.
	 */
	spin_unlock(&mapping->i_mmap_lock);
	return ret;
}

int rmap_walk(struct page *page, int (*rmap_one)(struct page *,
		struct vm_area_struct *, unsigned long, void *), void *arg)
{
	VM_BUG_ON(!PageLocked(page));

	if (unlikely(PageKsm(page)))
		return rmap_walk_ksm(page, rmap_one, arg);
	else if (PageAnon(page))
		return rmap_walk_anon(page, rmap_one, arg);
	else
		return rmap_walk_file(page, rmap_one, arg);
}
#endif /* CONFIG_MIGRATION */
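
/*
 * A minimal sketch of an rmap_walk() caller (hypothetical callback and
 * names; the in-tree user is remove_migration_ptes() in mm/migrate.c):
 * the callback runs once per (vma, address) where the locked page may
 * be mapped, and returns SWAP_AGAIN to keep walking:
 *
 *	static int fixup_one_pte(struct page *new, struct vm_area_struct *vma,
 *				 unsigned long addr, void *old)
 *	{
 *		// ... replace the migration pte for "old" at addr with a
 *		// real pte pointing at "new", if one is found there ...
 *		return SWAP_AGAIN;
 *	}
 *
 *	rmap_walk(new_page, fixup_one_pte, old_page);
 */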

#ifdef CONFIG_HUGETLBFS
/*
 * The following three functions are for anonymous (private mapped) hugepages.
 * Unlike common anonymous pages, anonymous hugepages have no accounting code
 * and no lru code, because we handle hugepages differently from common pages.
 */
static void __hugepage_set_anon_rmap(struct page *page,
	struct vm_area_struct *vma, unsigned long address, int exclusive)
{
	struct anon_vma *anon_vma = vma->anon_vma;
	BUG_ON(!anon_vma);
	if (!exclusive) {
		struct anon_vma_chain *avc;
		avc = list_entry(vma->anon_vma_chain.prev,
				 struct anon_vma_chain, same_vma);
		anon_vma = avc->anon_vma;
	}
	anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
	page->mapping = (struct address_space *) anon_vma;
	page->index = linear_page_index(vma, address);
}

void hugepage_add_anon_rmap(struct page *page,
			    struct vm_area_struct *vma, unsigned long address)
{
	struct anon_vma *anon_vma = vma->anon_vma;
	int first;
	BUG_ON(!anon_vma);
	BUG_ON(address < vma->vm_start || address >= vma->vm_end);
	first = atomic_inc_and_test(&page->_mapcount);
	if (first)
		__hugepage_set_anon_rmap(page, vma, address, 0);
}

void hugepage_add_new_anon_rmap(struct page *page,
			struct vm_area_struct *vma, unsigned long address)
{
	BUG_ON(address < vma->vm_start || address >= vma->vm_end);
	atomic_set(&page->_mapcount, 0);
	__hugepage_set_anon_rmap(page, vma, address, 1);
}
#endif /* CONFIG_HUGETLBFS */