vmscan.c revision 635697c663f38106063d5659f0cf2e45afcd4bb5
/*
 *  linux/mm/vmscan.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  Swap reorganised 29.12.95, Stephen Tweedie.
 *  kswapd added: 7.1.96  sct
 *  Removed kswapd_ctl limits, and swap out as many pages as needed
 *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
 *  Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
 *  Multiqueue VM started 5.8.00, Rik van Riel.
 */

#include <linux/mm.h>
#include <linux/module.h>
#include <linux/gfp.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/vmstat.h>
#include <linux/file.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>	/* for try_to_release_page(),
					buffer_heads_over_limit */
#include <linux/mm_inline.h>
#include <linux/pagevec.h>
#include <linux/backing-dev.h>
#include <linux/rmap.h>
#include <linux/topology.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/compaction.h>
#include <linux/notifier.h>
#include <linux/rwsem.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/memcontrol.h>
#include <linux/delayacct.h>
#include <linux/sysctl.h>
#include <linux/oom.h>
#include <linux/prefetch.h>

#include <asm/tlbflush.h>
#include <asm/div64.h>

#include <linux/swapops.h>

#include "internal.h"

#define CREATE_TRACE_POINTS
#include <trace/events/vmscan.h>

/*
 * reclaim_mode determines how the inactive list is shrunk
 * RECLAIM_MODE_SINGLE: Reclaim only order-0 pages
 * RECLAIM_MODE_ASYNC: Do not block
 * RECLAIM_MODE_SYNC: Allow blocking e.g. call wait_on_page_writeback
 * RECLAIM_MODE_LUMPYRECLAIM: For high-order allocations, take a reference
 *			page from the LRU and reclaim all pages within a
 *			naturally aligned range
 * RECLAIM_MODE_COMPACTION: For high-order allocations, reclaim a number of
 *			order-0 pages and then compact the zone
 */
typedef unsigned __bitwise__ reclaim_mode_t;
#define RECLAIM_MODE_SINGLE		((__force reclaim_mode_t)0x01u)
#define RECLAIM_MODE_ASYNC		((__force reclaim_mode_t)0x02u)
#define RECLAIM_MODE_SYNC		((__force reclaim_mode_t)0x04u)
#define RECLAIM_MODE_LUMPYRECLAIM	((__force reclaim_mode_t)0x08u)
#define RECLAIM_MODE_COMPACTION		((__force reclaim_mode_t)0x10u)

struct scan_control {
	/* Incremented by the number of inactive pages that were scanned */
	unsigned long nr_scanned;

	/* Number of pages freed so far during a call to shrink_zones() */
	unsigned long nr_reclaimed;

	/* How many pages shrink_list() should reclaim */
	unsigned long nr_to_reclaim;

	unsigned long hibernation_mode;

	/* This context's GFP mask */
	gfp_t gfp_mask;

	int may_writepage;

	/* Can mapped pages be reclaimed? */
	int may_unmap;

	/* Can pages be swapped as part of reclaim? */
	int may_swap;

	int order;

	/*
	 * Intend to reclaim enough contiguous memory rather than just a
	 * sufficient amount of memory, i.e. the mode for high-order
	 * allocation.
	 */
	reclaim_mode_t reclaim_mode;

	/* Which cgroup do we reclaim from */
	struct mem_cgroup *mem_cgroup;

	/*
	 * Nodemask of nodes allowed by the caller. If NULL, all nodes
	 * are scanned.
	 */
	nodemask_t *nodemask;
};
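
/*
 * Example (illustrative sketch, not copied from a specific caller): a
 * direct-reclaim entry point in this kernel generation typically fills
 * in struct scan_control along these lines; the exact initializer below
 * is an assumption for illustration:
 *
 *	struct scan_control sc = {
 *		.gfp_mask	= gfp_mask,
 *		.nr_to_reclaim	= SWAP_CLUSTER_MAX,
 *		.may_writepage	= !laptop_mode,
 *		.may_unmap	= 1,
 *		.may_swap	= 1,
 *		.order		= order,
 *		.mem_cgroup	= NULL,		(NULL => global reclaim)
 *		.nodemask	= nodemask,
 *	};
 */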

#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))

#ifdef ARCH_HAS_PREFETCH
#define prefetch_prev_lru_page(_page, _base, _field)			\
	do {								\
		if ((_page)->lru.prev != _base) {			\
			struct page *prev;				\
									\
			prev = lru_to_page(&(_page->lru));		\
			prefetch(&prev->_field);			\
		}							\
	} while (0)
#else
#define prefetch_prev_lru_page(_page, _base, _field) do { } while (0)
#endif

#ifdef ARCH_HAS_PREFETCHW
#define prefetchw_prev_lru_page(_page, _base, _field)			\
	do {								\
		if ((_page)->lru.prev != _base) {			\
			struct page *prev;				\
									\
			prev = lru_to_page(&(_page->lru));		\
			prefetchw(&prev->_field);			\
		}							\
	} while (0)
#else
#define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
#endif

/*
 * From 0 .. 100.  Higher means more swappy.
 */
int vm_swappiness = 60;
long vm_total_pages;	/* The total number of pages which the VM controls */

static LIST_HEAD(shrinker_list);
static DECLARE_RWSEM(shrinker_rwsem);

#ifdef CONFIG_CGROUP_MEM_RES_CTLR
#define scanning_global_lru(sc)	(!(sc)->mem_cgroup)
#else
#define scanning_global_lru(sc)	(1)
#endif

static struct zone_reclaim_stat *get_reclaim_stat(struct zone *zone,
						  struct scan_control *sc)
{
	if (!scanning_global_lru(sc))
		return mem_cgroup_get_reclaim_stat(sc->mem_cgroup, zone);

	return &zone->reclaim_stat;
}

static unsigned long zone_nr_lru_pages(struct zone *zone,
				struct scan_control *sc, enum lru_list lru)
{
	if (!scanning_global_lru(sc))
		return mem_cgroup_zone_nr_lru_pages(sc->mem_cgroup,
				zone_to_nid(zone), zone_idx(zone), BIT(lru));

	return zone_page_state(zone, NR_LRU_BASE + lru);
}

/*
 * Add a shrinker callback to be called from the vm
 */
void register_shrinker(struct shrinker *shrinker)
{
	shrinker->nr = 0;
	down_write(&shrinker_rwsem);
	list_add_tail(&shrinker->list, &shrinker_list);
	up_write(&shrinker_rwsem);
}
EXPORT_SYMBOL(register_shrinker);

/*
 * Remove one
 */
void unregister_shrinker(struct shrinker *shrinker)
{
	down_write(&shrinker_rwsem);
	list_del(&shrinker->list);
	up_write(&shrinker_rwsem);
}
EXPORT_SYMBOL(unregister_shrinker);

static inline int do_shrinker_shrink(struct shrinker *shrinker,
				     struct shrink_control *sc,
				     unsigned long nr_to_scan)
{
	sc->nr_to_scan = nr_to_scan;
	return (*shrinker->shrink)(shrinker, sc);
}
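
/*
 * Sketch of the shrinker API as consumed by a cache owner, under the
 * ->shrink() protocol used by shrink_slab() below (nr_to_scan == 0
 * queries the freeable-object count; returning -1 aborts the scan).
 * The cache and its count/free helpers are hypothetical:
 *
 *	static int my_cache_shrink(struct shrinker *s,
 *				   struct shrink_control *sc)
 *	{
 *		if (!(sc->gfp_mask & __GFP_FS))
 *			return -1;	(defer: cannot recurse into the FS)
 *		if (sc->nr_to_scan)
 *			my_cache_free_objects(sc->nr_to_scan);
 *		return my_cache_count_objects();
 *	}
 *
 *	static struct shrinker my_shrinker = {
 *		.shrink	= my_cache_shrink,
 *		.seeks	= DEFAULT_SEEKS,
 *	};
 *	register_shrinker(&my_shrinker);
 */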

#define SHRINK_BATCH 128
/*
 * Call the shrink functions to age shrinkable caches
 *
 * Here we assume it costs one seek to replace a lru page and that it also
 * takes a seek to recreate a cache object.  With this in mind we age equal
 * percentages of the lru and ageable caches.  This should balance the seeks
 * generated by these structures.
 *
 * If the vm encountered mapped pages on the LRU it increases the pressure on
 * slab to avoid swapping.
 *
 * We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits.
 *
 * `lru_pages' represents the number of on-LRU pages in all the zones which
 * are eligible for the caller's allocation attempt.  It is used for balancing
 * slab reclaim versus page reclaim.
 *
 * Returns the number of slab objects which we shrunk.
 */
unsigned long shrink_slab(struct shrink_control *shrink,
			  unsigned long nr_pages_scanned,
			  unsigned long lru_pages)
{
	struct shrinker *shrinker;
	unsigned long ret = 0;

	if (nr_pages_scanned == 0)
		nr_pages_scanned = SWAP_CLUSTER_MAX;

	if (!down_read_trylock(&shrinker_rwsem)) {
		/* Assume we'll be able to shrink next time */
		ret = 1;
		goto out;
	}

	list_for_each_entry(shrinker, &shrinker_list, list) {
		unsigned long long delta;
		long total_scan;
		long max_pass;
		int shrink_ret = 0;
		long nr;
		long new_nr;
		long batch_size = shrinker->batch ? shrinker->batch
						  : SHRINK_BATCH;

		max_pass = do_shrinker_shrink(shrinker, shrink, 0);
		if (max_pass <= 0)
			continue;

		/*
		 * copy the current shrinker scan count into a local variable
		 * and zero it so that other concurrent shrinker invocations
		 * don't also do this scanning work.
		 */
		do {
			nr = shrinker->nr;
		} while (cmpxchg(&shrinker->nr, nr, 0) != nr);

		total_scan = nr;
		delta = (4 * nr_pages_scanned) / shrinker->seeks;
		delta *= max_pass;
		do_div(delta, lru_pages + 1);
		total_scan += delta;
		if (total_scan < 0) {
			printk(KERN_ERR "shrink_slab: %pF negative objects to "
			       "delete nr=%ld\n",
			       shrinker->shrink, total_scan);
			total_scan = max_pass;
		}

		/*
		 * We need to avoid excessive windup on filesystem shrinkers
		 * due to large numbers of GFP_NOFS allocations causing the
		 * shrinkers to return -1 all the time. This results in a large
		 * nr being built up so when a shrink that can do some work
		 * comes along it empties the entire cache due to nr >>>
		 * max_pass.  This is bad for sustaining a working set in
		 * memory.
		 *
		 * Hence only allow the shrinker to scan the entire cache when
		 * a large delta change is calculated directly.
		 */
		if (delta < max_pass / 4)
			total_scan = min(total_scan, max_pass / 2);

		/*
		 * Avoid the risk of looping forever due to a too large nr
		 * value: never try to free more than twice the estimated
		 * number of freeable entries.
		 */
		if (total_scan > max_pass * 2)
			total_scan = max_pass * 2;

		trace_mm_shrink_slab_start(shrinker, shrink, nr,
					nr_pages_scanned, lru_pages,
					max_pass, delta, total_scan);

		while (total_scan >= batch_size) {
			int nr_before;

			nr_before = do_shrinker_shrink(shrinker, shrink, 0);
			shrink_ret = do_shrinker_shrink(shrinker, shrink,
							batch_size);
			if (shrink_ret == -1)
				break;
			if (shrink_ret < nr_before)
				ret += nr_before - shrink_ret;
			count_vm_events(SLABS_SCANNED, batch_size);
			total_scan -= batch_size;

			cond_resched();
		}

		/*
		 * move the unused scan count back into the shrinker in a
		 * manner that handles concurrent updates. If we exhausted the
		 * scan, there is no need to do an update.
		 */
		do {
			nr = shrinker->nr;
			new_nr = total_scan + nr;
			if (total_scan <= 0)
				break;
		} while (cmpxchg(&shrinker->nr, nr, new_nr) != nr);

		trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr);
	}
	up_read(&shrinker_rwsem);
out:
	cond_resched();
	return ret;
}
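
/*
 * Worked example of the aging arithmetic above, with illustrative
 * numbers.  Suppose nr_pages_scanned = 1024, lru_pages = 1,000,000, and
 * a shrinker with seeks = DEFAULT_SEEKS (2) reporting max_pass = 10,000
 * freeable objects:
 *
 *	delta  = (4 * 1024) / 2 = 2048
 *	delta *= 10,000          -> 20,480,000
 *	delta /= 1,000,001       -> ~20 objects to scan
 *
 * i.e. the cache is asked to scan roughly twice the fraction of its
 * objects (0.2%) that the LRU scan covered (0.1%), the factor of two
 * coming from 4 / seeks.
 */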

static void set_reclaim_mode(int priority, struct scan_control *sc,
				   bool sync)
{
	reclaim_mode_t syncmode = sync ? RECLAIM_MODE_SYNC : RECLAIM_MODE_ASYNC;

	/*
	 * Initially assume we are entering either lumpy reclaim or
	 * reclaim/compaction. Depending on the order, we will either set the
	 * sync mode or just reclaim order-0 pages later.
	 */
	if (COMPACTION_BUILD)
		sc->reclaim_mode = RECLAIM_MODE_COMPACTION;
	else
		sc->reclaim_mode = RECLAIM_MODE_LUMPYRECLAIM;

	/*
	 * Avoid using lumpy reclaim or reclaim/compaction if possible by
	 * restricting when it is set, to either costly allocations or when
	 * under memory pressure.
	 */
	if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
		sc->reclaim_mode |= syncmode;
	else if (sc->order && priority < DEF_PRIORITY - 2)
		sc->reclaim_mode |= syncmode;
	else
		sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC;
}

static void reset_reclaim_mode(struct scan_control *sc)
{
	sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC;
}

static inline int is_page_cache_freeable(struct page *page)
{
	/*
	 * A freeable page cache page is referenced only by the caller
	 * that isolated the page, the page cache radix tree and
	 * optional buffer heads at page->private.
	 */
	return page_count(page) - page_has_private(page) == 2;
}
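
/*
 * The "== 2" above can be read off from the references a candidate page
 * holds at this point:
 *
 *	1 ref - the caller that isolated the page from the LRU
 *	1 ref - the page cache radix tree
 *	1 ref - buffer heads at page->private, if any (this is exactly
 *		what page_has_private() accounts for)
 *
 * so page_count() - page_has_private() == 2 means nobody else (e.g. a
 * concurrent get_page() user) holds a reference.
 */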

static int may_write_to_queue(struct backing_dev_info *bdi,
			      struct scan_control *sc)
{
	if (current->flags & PF_SWAPWRITE)
		return 1;
	if (!bdi_write_congested(bdi))
		return 1;
	if (bdi == current->backing_dev_info)
		return 1;

	/* lumpy reclaim for hugepage often needs a lot of writes */
	if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
		return 1;
	return 0;
}

/*
 * We detected a synchronous write error writing a page out.  Probably
 * -ENOSPC.  We need to propagate that into the address_space for a subsequent
 * fsync(), msync() or close().
 *
 * The tricky part is that after writepage we cannot touch the mapping: nothing
 * prevents it from being freed up.  But we have a ref on the page and once
 * that page is locked, the mapping is pinned.
 *
 * We're allowed to run sleeping lock_page() here because we know the caller has
 * __GFP_FS.
 */
static void handle_write_error(struct address_space *mapping,
				struct page *page, int error)
{
	lock_page(page);
	if (page_mapping(page) == mapping)
		mapping_set_error(mapping, error);
	unlock_page(page);
}

/* possible outcome of pageout() */
typedef enum {
	/* failed to write page out, page is locked */
	PAGE_KEEP,
	/* move page to the active list, page is locked */
	PAGE_ACTIVATE,
	/* page has been sent to the disk successfully, page is unlocked */
	PAGE_SUCCESS,
	/* page is clean and locked */
	PAGE_CLEAN,
} pageout_t;

/*
 * pageout is called by shrink_page_list() for each dirty page.
 * Calls ->writepage().
 */
static pageout_t pageout(struct page *page, struct address_space *mapping,
			 struct scan_control *sc)
{
	/*
	 * If the page is dirty, only perform writeback if that write
	 * will be non-blocking, to prevent this allocation from being
	 * stalled by pagecache activity.  But note that there may be
	 * stalls if we need to run get_block().  We could test
	 * PagePrivate for that.
	 *
	 * If this process is currently in __generic_file_aio_write() against
	 * this page's queue, we can perform writeback even if that
	 * will block.
	 *
	 * If the page is swapcache, write it back even if that would
	 * block, for some throttling.  This happens by accident, because
	 * swap_backing_dev_info is bust: it doesn't reflect the
	 * congestion state of the swapdevs.  Easy to fix, if needed.
	 */
	if (!is_page_cache_freeable(page))
		return PAGE_KEEP;
	if (!mapping) {
		/*
		 * Some data journaling orphaned pages can have
		 * page->mapping == NULL while being dirty with clean buffers.
		 */
		if (page_has_private(page)) {
			if (try_to_free_buffers(page)) {
				ClearPageDirty(page);
				printk("%s: orphaned page\n", __func__);
				return PAGE_CLEAN;
			}
		}
		return PAGE_KEEP;
	}
	if (mapping->a_ops->writepage == NULL)
		return PAGE_ACTIVATE;
	if (!may_write_to_queue(mapping->backing_dev_info, sc))
		return PAGE_KEEP;

	if (clear_page_dirty_for_io(page)) {
		int res;
		struct writeback_control wbc = {
			.sync_mode = WB_SYNC_NONE,
			.nr_to_write = SWAP_CLUSTER_MAX,
			.range_start = 0,
			.range_end = LLONG_MAX,
			.for_reclaim = 1,
		};

		SetPageReclaim(page);
		res = mapping->a_ops->writepage(page, &wbc);
		if (res < 0)
			handle_write_error(mapping, page, res);
		if (res == AOP_WRITEPAGE_ACTIVATE) {
			ClearPageReclaim(page);
			return PAGE_ACTIVATE;
		}

		if (!PageWriteback(page)) {
			/* synchronous write or broken a_ops? */
			ClearPageReclaim(page);
		}
		trace_mm_vmscan_writepage(page,
			trace_reclaim_flags(page, sc->reclaim_mode));
		inc_zone_page_state(page, NR_VMSCAN_WRITE);
		return PAGE_SUCCESS;
	}

	return PAGE_CLEAN;
}

/*
 * Same as remove_mapping, but if the page is removed from the mapping, it
 * gets returned with a refcount of 0.
 */
static int __remove_mapping(struct address_space *mapping, struct page *page)
{
	BUG_ON(!PageLocked(page));
	BUG_ON(mapping != page_mapping(page));

	spin_lock_irq(&mapping->tree_lock);
	/*
	 * The non-racy check for a busy page.
	 *
	 * Must be careful with the order of the tests. When someone has
	 * a ref to the page, it may be possible that they dirty it then
	 * drop the reference. So if PageDirty is tested before page_count
	 * here, then the following race may occur:
	 *
	 * get_user_pages(&page);
	 * [user mapping goes away]
	 * write_to(page);
	 *				!PageDirty(page)    [good]
	 * SetPageDirty(page);
	 * put_page(page);
	 *				!page_count(page)   [good, discard it]
	 *
	 * [oops, our write_to data is lost]
	 *
	 * Reversing the order of the tests ensures such a situation cannot
	 * escape unnoticed. The smp_rmb is needed to ensure the page->flags
	 * load is not satisfied before that of page->_count.
	 *
	 * Note that if SetPageDirty is always performed via set_page_dirty,
	 * and thus under tree_lock, then this ordering is not required.
	 */
	if (!page_freeze_refs(page, 2))
		goto cannot_free;
	/* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */
	if (unlikely(PageDirty(page))) {
		page_unfreeze_refs(page, 2);
		goto cannot_free;
	}

	if (PageSwapCache(page)) {
		swp_entry_t swap = { .val = page_private(page) };
		__delete_from_swap_cache(page);
		spin_unlock_irq(&mapping->tree_lock);
		swapcache_free(swap, page);
	} else {
		void (*freepage)(struct page *);

		freepage = mapping->a_ops->freepage;

		__delete_from_page_cache(page);
		spin_unlock_irq(&mapping->tree_lock);
		mem_cgroup_uncharge_cache_page(page);

		if (freepage != NULL)
			freepage(page);
	}

	return 1;

cannot_free:
	spin_unlock_irq(&mapping->tree_lock);
	return 0;
}

/*
 * Attempt to detach a locked page from its ->mapping.  If it is dirty or if
 * someone else has a ref on the page, abort and return 0.  If it was
 * successfully detached, return 1.  Assumes the caller has a single ref on
 * this page.
 */
int remove_mapping(struct address_space *mapping, struct page *page)
{
	if (__remove_mapping(mapping, page)) {
		/*
		 * Unfreezing the refcount with 1 rather than 2 effectively
		 * drops the pagecache ref for us without requiring another
		 * atomic operation.
		 */
		page_unfreeze_refs(page, 1);
		return 1;
	}
	return 0;
}
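
/*
 * Why the magic constant above is 2: page_freeze_refs(page, 2) is an
 * atomic cmpxchg on page->_count that succeeds, and drops the count to
 * zero, only if the sole references are
 *
 *	1 (isolating caller) + 1 (the mapping) = 2
 *
 * remove_mapping() then unfreezes with 1 rather than 2, leaving just
 * the caller's reference and saving one atomic operation, as the
 * comment above notes.
 */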

/**
 * putback_lru_page - put previously isolated page onto appropriate LRU list
 * @page: page to be put back to appropriate lru list
 *
 * Add previously isolated @page to appropriate LRU list.
 * Page may still be unevictable for other reasons.
 *
 * lru_lock must not be held, interrupts must be enabled.
 */
void putback_lru_page(struct page *page)
{
	int lru;
	int active = !!TestClearPageActive(page);
	int was_unevictable = PageUnevictable(page);

	VM_BUG_ON(PageLRU(page));

redo:
	ClearPageUnevictable(page);

	if (page_evictable(page, NULL)) {
		/*
		 * For evictable pages, we can use the cache.
		 * In event of a race, worst case is we end up with an
		 * unevictable page on [in]active list.
		 * We know how to handle that.
		 */
		lru = active + page_lru_base_type(page);
		lru_cache_add_lru(page, lru);
	} else {
		/*
		 * Put unevictable pages directly on zone's unevictable
		 * list.
		 */
		lru = LRU_UNEVICTABLE;
		add_page_to_unevictable_list(page);
		/*
		 * When racing with an mlock or AS_UNEVICTABLE clearing
		 * (page is unlocked) make sure that if the other thread
		 * does not observe our setting of PG_lru and fails
		 * isolation/check_move_unevictable_page,
		 * we see PG_mlocked/AS_UNEVICTABLE cleared below and move
		 * the page back to the evictable list.
		 *
		 * The other side is TestClearPageMlocked() or shmem_lock().
		 */
		smp_mb();
	}

	/*
	 * page's status can change while we move it among lru. If an
	 * evictable page is on the unevictable list, it will never be
	 * freed. To avoid that, check again after we have added it to
	 * the list.
	 */
	if (lru == LRU_UNEVICTABLE && page_evictable(page, NULL)) {
		if (!isolate_lru_page(page)) {
			put_page(page);
			goto redo;
		}
		/* This means someone else dropped this page from LRU
		 * So, it will be freed or putback to LRU again. There is
		 * nothing to do here.
		 */
	}

	if (was_unevictable && lru != LRU_UNEVICTABLE)
		count_vm_event(UNEVICTABLE_PGRESCUED);
	else if (!was_unevictable && lru == LRU_UNEVICTABLE)
		count_vm_event(UNEVICTABLE_PGCULLED);

	put_page(page);		/* drop ref from isolate */
}

enum page_references {
	PAGEREF_RECLAIM,
	PAGEREF_RECLAIM_CLEAN,
	PAGEREF_KEEP,
	PAGEREF_ACTIVATE,
};

static enum page_references page_check_references(struct page *page,
						  struct scan_control *sc)
{
	int referenced_ptes, referenced_page;
	unsigned long vm_flags;

	referenced_ptes = page_referenced(page, 1, sc->mem_cgroup, &vm_flags);
	referenced_page = TestClearPageReferenced(page);

	/* Lumpy reclaim - ignore references */
	if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
		return PAGEREF_RECLAIM;

	/*
	 * Mlock lost the isolation race with us.  Let try_to_unmap()
	 * move the page to the unevictable list.
	 */
	if (vm_flags & VM_LOCKED)
		return PAGEREF_RECLAIM;

	if (referenced_ptes) {
		if (PageAnon(page))
			return PAGEREF_ACTIVATE;
		/*
		 * All mapped pages start out with page table
		 * references from the instantiating fault, so we need
		 * to look twice if a mapped file page is used more
		 * than once.
		 *
		 * Mark it and spare it for another trip around the
		 * inactive list.  Another page table reference will
		 * lead to its activation.
		 *
		 * Note: the mark is set for activated pages as well
		 * so that recently deactivated but used pages are
		 * quickly recovered.
		 */
		SetPageReferenced(page);

		if (referenced_page)
			return PAGEREF_ACTIVATE;

		return PAGEREF_KEEP;
	}

	/* Reclaim if clean, defer dirty pages to writeback */
	if (referenced_page && !PageSwapBacked(page))
		return PAGEREF_RECLAIM_CLEAN;

	return PAGEREF_RECLAIM;
}
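
/*
 * The decision matrix implemented above, summarised.  "ptes" is the
 * page_referenced() count, "PG_ref" the software referenced bit:
 *
 *	lumpy reclaim mode          -> PAGEREF_RECLAIM
 *	VM_LOCKED vma found         -> PAGEREF_RECLAIM (culled later)
 *	ptes > 0, anon              -> PAGEREF_ACTIVATE
 *	ptes > 0, file, PG_ref set  -> PAGEREF_ACTIVATE (second use)
 *	ptes > 0, file, PG_ref off  -> PAGEREF_KEEP (mark and retry)
 *	ptes == 0, PG_ref set,
 *	           !PageSwapBacked  -> PAGEREF_RECLAIM_CLEAN
 *	otherwise                   -> PAGEREF_RECLAIM
 */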

static noinline_for_stack void free_page_list(struct list_head *free_pages)
{
	struct pagevec freed_pvec;
	struct page *page, *tmp;

	pagevec_init(&freed_pvec, 1);

	list_for_each_entry_safe(page, tmp, free_pages, lru) {
		list_del(&page->lru);
		if (!pagevec_add(&freed_pvec, page)) {
			__pagevec_free(&freed_pvec);
			pagevec_reinit(&freed_pvec);
		}
	}

	pagevec_free(&freed_pvec);
}

/*
 * shrink_page_list() returns the number of reclaimed pages
 */
static unsigned long shrink_page_list(struct list_head *page_list,
				      struct zone *zone,
				      struct scan_control *sc,
				      int priority,
				      unsigned long *ret_nr_dirty,
				      unsigned long *ret_nr_writeback)
{
	LIST_HEAD(ret_pages);
	LIST_HEAD(free_pages);
	int pgactivate = 0;
	unsigned long nr_dirty = 0;
	unsigned long nr_congested = 0;
	unsigned long nr_reclaimed = 0;
	unsigned long nr_writeback = 0;

	cond_resched();

	while (!list_empty(page_list)) {
		enum page_references references;
		struct address_space *mapping;
		struct page *page;
		int may_enter_fs;

		cond_resched();

		page = lru_to_page(page_list);
		list_del(&page->lru);

		if (!trylock_page(page))
			goto keep;

		VM_BUG_ON(PageActive(page));
		VM_BUG_ON(page_zone(page) != zone);

		sc->nr_scanned++;

		if (unlikely(!page_evictable(page, NULL)))
			goto cull_mlocked;

		if (!sc->may_unmap && page_mapped(page))
			goto keep_locked;

		/* Double the slab pressure for mapped and swapcache pages */
		if (page_mapped(page) || PageSwapCache(page))
			sc->nr_scanned++;

		may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
			(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));

		if (PageWriteback(page)) {
			nr_writeback++;
			/*
			 * Synchronous reclaim cannot queue pages for
			 * writeback due to the possibility of stack overflow
			 * but if it encounters a page under writeback, wait
			 * for the IO to complete.
			 */
			if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) &&
			    may_enter_fs)
				wait_on_page_writeback(page);
			else {
				unlock_page(page);
				goto keep_lumpy;
			}
		}

		references = page_check_references(page, sc);
		switch (references) {
		case PAGEREF_ACTIVATE:
			goto activate_locked;
		case PAGEREF_KEEP:
			goto keep_locked;
		case PAGEREF_RECLAIM:
		case PAGEREF_RECLAIM_CLEAN:
			; /* try to reclaim the page below */
		}

		/*
		 * Anonymous process memory has backing store?
		 * Try to allocate it some swap space here.
		 */
		if (PageAnon(page) && !PageSwapCache(page)) {
			if (!(sc->gfp_mask & __GFP_IO))
				goto keep_locked;
			if (!add_to_swap(page))
				goto activate_locked;
			may_enter_fs = 1;
		}

		mapping = page_mapping(page);

		/*
		 * The page is mapped into the page tables of one or more
		 * processes. Try to unmap it here.
		 */
		if (page_mapped(page) && mapping) {
			switch (try_to_unmap(page, TTU_UNMAP)) {
			case SWAP_FAIL:
				goto activate_locked;
			case SWAP_AGAIN:
				goto keep_locked;
			case SWAP_MLOCK:
				goto cull_mlocked;
			case SWAP_SUCCESS:
				; /* try to free the page below */
			}
		}

		if (PageDirty(page)) {
			nr_dirty++;

			/*
			 * Only kswapd can writeback filesystem pages to
			 * avoid risk of stack overflow but do not writeback
			 * unless under significant pressure.
			 */
			if (page_is_file_cache(page) &&
			    (!current_is_kswapd() || priority >= DEF_PRIORITY - 2)) {
				/*
				 * Immediately reclaim when written back.
				 * Similar in principle to deactivate_page()
				 * except we already have the page isolated
				 * and know it's dirty.
				 */
				inc_zone_page_state(page, NR_VMSCAN_IMMEDIATE);
				SetPageReclaim(page);

				goto keep_locked;
			}

			if (references == PAGEREF_RECLAIM_CLEAN)
				goto keep_locked;
			if (!may_enter_fs)
				goto keep_locked;
			if (!sc->may_writepage)
				goto keep_locked;

			/* Page is dirty, try to write it out here */
			switch (pageout(page, mapping, sc)) {
			case PAGE_KEEP:
				nr_congested++;
				goto keep_locked;
			case PAGE_ACTIVATE:
				goto activate_locked;
			case PAGE_SUCCESS:
				if (PageWriteback(page))
					goto keep_lumpy;
				if (PageDirty(page))
					goto keep;

				/*
				 * A synchronous write - probably a ramdisk.  Go
				 * ahead and try to reclaim the page.
				 */
				if (!trylock_page(page))
					goto keep;
				if (PageDirty(page) || PageWriteback(page))
					goto keep_locked;
				mapping = page_mapping(page);
			case PAGE_CLEAN:
				; /* try to free the page below */
			}
		}

		/*
		 * If the page has buffers, try to free the buffer mappings
		 * associated with this page. If we succeed we try to free
		 * the page as well.
		 *
		 * We do this even if the page is PageDirty().
		 * try_to_release_page() does not perform I/O, but it is
		 * possible for a page to have PageDirty set, but it is actually
		 * clean (all its buffers are clean).  This happens if the
		 * buffers were written out directly, with submit_bh(). ext3
		 * will do this, as well as the blockdev mapping.
		 * try_to_release_page() will discover that cleanness and will
		 * drop the buffers and mark the page clean - it can be freed.
		 *
		 * Rarely, pages can have buffers and no ->mapping.  These are
		 * the pages which were not successfully invalidated in
		 * truncate_complete_page().  We try to drop those buffers here
		 * and if that worked, and the page is no longer mapped into
		 * process address space (page_count == 1) it can be freed.
		 * Otherwise, leave the page on the LRU so it is swappable.
		 */
		if (page_has_private(page)) {
			if (!try_to_release_page(page, sc->gfp_mask))
				goto activate_locked;
			if (!mapping && page_count(page) == 1) {
				unlock_page(page);
				if (put_page_testzero(page))
					goto free_it;
				else {
					/*
					 * Rare race with a speculative
					 * reference: the speculative
					 * reference will free this page
					 * shortly, so we may increment
					 * nr_reclaimed here (and leave it
					 * off the LRU).
					 */
					nr_reclaimed++;
					continue;
				}
			}
		}

		if (!mapping || !__remove_mapping(mapping, page))
			goto keep_locked;

		/*
		 * At this point, we have no other references and there is
		 * no way to pick any more up (removed from LRU, removed
		 * from pagecache). Can use non-atomic bitops now (and
		 * we obviously don't have to worry about waking up a process
		 * waiting on the page lock, because there are no references.
		 */
		__clear_page_locked(page);
free_it:
		nr_reclaimed++;

		/*
		 * Is there a need to periodically drain free_pages via
		 * free_page_list()?  It would appear not, as the counts
		 * should be low.
		 */
		list_add(&page->lru, &free_pages);
		continue;

cull_mlocked:
		if (PageSwapCache(page))
			try_to_free_swap(page);
		unlock_page(page);
		putback_lru_page(page);
		reset_reclaim_mode(sc);
		continue;

activate_locked:
		/* Not a candidate for swapping, so reclaim swap space. */
		if (PageSwapCache(page) && vm_swap_full())
			try_to_free_swap(page);
		VM_BUG_ON(PageActive(page));
		SetPageActive(page);
		pgactivate++;
keep_locked:
		unlock_page(page);
keep:
		reset_reclaim_mode(sc);
keep_lumpy:
		list_add(&page->lru, &ret_pages);
		VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
	}

	/*
	 * Tag a zone as congested if all the dirty pages encountered were
	 * backed by a congested BDI. In this case, reclaimers should just
	 * back off and wait for congestion to clear because further reclaim
	 * will encounter the same problem.
	 */
	if (nr_dirty && nr_dirty == nr_congested && scanning_global_lru(sc))
		zone_set_flag(zone, ZONE_CONGESTED);

	free_page_list(&free_pages);

	list_splice(&ret_pages, page_list);
	count_vm_events(PGACTIVATE, pgactivate);
	*ret_nr_dirty += nr_dirty;
	*ret_nr_writeback += nr_writeback;
	return nr_reclaimed;
}

/*
 * Attempt to remove the specified page from its LRU.  Only take this page
 * if it is of the appropriate PageActive status.  Pages which are being
 * freed elsewhere are also ignored.
 *
 * page:	page to consider
 * mode:	one of the LRU isolation modes defined above
 *
 * returns 0 on success, -ve errno on failure.
 */
int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file)
{
	bool all_lru_mode;
	int ret = -EINVAL;

	/* Only take pages on the LRU. */
	if (!PageLRU(page))
		return ret;

	all_lru_mode = (mode & (ISOLATE_ACTIVE|ISOLATE_INACTIVE)) ==
		(ISOLATE_ACTIVE|ISOLATE_INACTIVE);

	/*
	 * When checking the active state, we need to be sure we are
	 * dealing with comparable boolean values.  Take the logical not
	 * of each.
	 */
	if (!all_lru_mode && !PageActive(page) != !(mode & ISOLATE_ACTIVE))
		return ret;

	if (!all_lru_mode && !!page_is_file_cache(page) != file)
		return ret;

	/*
	 * When this function is being called for lumpy reclaim, we
	 * initially look into all LRU pages, active, inactive and
	 * unevictable; only give shrink_page_list evictable pages.
	 */
	if (PageUnevictable(page))
		return ret;

	ret = -EBUSY;

	if ((mode & ISOLATE_CLEAN) && (PageDirty(page) || PageWriteback(page)))
		return ret;

	if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
		return ret;

	if (likely(get_page_unless_zero(page))) {
		/*
		 * Be careful not to clear PageLRU until after we're
		 * sure the page is not being freed elsewhere -- the
		 * page release code relies on it.
		 */
		ClearPageLRU(page);
		ret = 0;
	}

	return ret;
}
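
/*
 * An illustrative combination of the isolation modes checked above.
 * Lumpy reclaim in shrink_inactive_list() builds its mode roughly as:
 *
 *	isolate_mode_t mode = ISOLATE_INACTIVE | ISOLATE_ACTIVE;
 *	if (!sc->may_unmap)
 *		mode |= ISOLATE_UNMAPPED;	(skip pages with ptes)
 *	if (!sc->may_writepage)
 *		mode |= ISOLATE_CLEAN;		(skip dirty/writeback pages)
 *
 * With both ACTIVE and INACTIVE set, all_lru_mode short-circuits the
 * active-state and file/anon checks, so any evictable LRU page in the
 * target block qualifies.
 */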

/*
 * zone->lru_lock is heavily contended.  Some of the functions that
 * shrink the lists perform better by taking out a batch of pages
 * and working on them outside the LRU lock.
 *
 * For pagecache intensive workloads, this function is the hottest
 * spot in the kernel (apart from copy_*_user functions).
 *
 * Appropriate locks must be held before calling this function.
 *
 * @nr_to_scan:	The number of pages to look through on the list.
 * @src:	The LRU list to pull pages off.
 * @dst:	The temp list to put pages on to.
 * @scanned:	The number of pages that were scanned.
 * @order:	The caller's attempted allocation order
 * @mode:	One of the LRU isolation modes
 * @file:	True [1] if isolating file [!anon] pages
 *
 * returns how many pages were moved onto *@dst.
 */
static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
		struct list_head *src, struct list_head *dst,
		unsigned long *scanned, int order, isolate_mode_t mode,
		int file)
{
	unsigned long nr_taken = 0;
	unsigned long nr_lumpy_taken = 0;
	unsigned long nr_lumpy_dirty = 0;
	unsigned long nr_lumpy_failed = 0;
	unsigned long scan;

	for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
		struct page *page;
		unsigned long pfn;
		unsigned long end_pfn;
		unsigned long page_pfn;
		int zone_id;

		page = lru_to_page(src);
		prefetchw_prev_lru_page(page, src, flags);

		VM_BUG_ON(!PageLRU(page));

		switch (__isolate_lru_page(page, mode, file)) {
		case 0:
			list_move(&page->lru, dst);
			mem_cgroup_del_lru(page);
			nr_taken += hpage_nr_pages(page);
			break;

		case -EBUSY:
			/* else it is being freed elsewhere */
			list_move(&page->lru, src);
			mem_cgroup_rotate_lru_list(page, page_lru(page));
			continue;

		default:
			BUG();
		}

		if (!order)
			continue;

		/*
		 * Attempt to take all pages in the order aligned region
		 * surrounding the tag page.  Only take those pages of
		 * the same active state as that tag page.  We may safely
		 * round the target page pfn down to the requested order
		 * as the mem_map is guaranteed valid out to MAX_ORDER;
		 * if a page is in a different zone we will detect it from
		 * its zone id and abort this block scan.
		 */
		zone_id = page_zone_id(page);
		page_pfn = page_to_pfn(page);
		pfn = page_pfn & ~((1 << order) - 1);
		end_pfn = pfn + (1 << order);
		for (; pfn < end_pfn; pfn++) {
			struct page *cursor_page;

			/* The target page is in the block, ignore it. */
			if (unlikely(pfn == page_pfn))
				continue;

			/* Avoid holes within the zone. */
			if (unlikely(!pfn_valid_within(pfn)))
				break;

			cursor_page = pfn_to_page(pfn);

			/* Check that we have not crossed a zone boundary. */
			if (unlikely(page_zone_id(cursor_page) != zone_id))
				break;

			/*
			 * If we don't have enough swap space, reclaiming
			 * anon pages which don't already have a swap slot
			 * is pointless.
			 */
			if (nr_swap_pages <= 0 && PageAnon(cursor_page) &&
			    !PageSwapCache(cursor_page))
				break;

			if (__isolate_lru_page(cursor_page, mode, file) == 0) {
				list_move(&cursor_page->lru, dst);
				mem_cgroup_del_lru(cursor_page);
				nr_taken += hpage_nr_pages(page);
				nr_lumpy_taken++;
				if (PageDirty(cursor_page))
					nr_lumpy_dirty++;
				scan++;
			} else {
				/*
				 * Check if the page is freed already.
				 *
				 * We can't use page_count() as that
				 * requires compound_head and we don't
				 * have a pin on the page here. If a
				 * page is tail, we may or may not
				 * have isolated the head, so assume
				 * it's not free, it'd be tricky to
				 * track the head status without a
				 * page pin.
				 */
				if (!PageTail(cursor_page) &&
				    !atomic_read(&cursor_page->_count))
					continue;
				break;
			}
		}

		/* If we break out of the loop above, lumpy reclaim failed */
		if (pfn < end_pfn)
			nr_lumpy_failed++;
	}

	*scanned = scan;

	trace_mm_vmscan_lru_isolate(order,
			nr_to_scan, scan,
			nr_taken,
			nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed,
			mode);
	return nr_taken;
}

static unsigned long isolate_pages_global(unsigned long nr,
					struct list_head *dst,
					unsigned long *scanned, int order,
					isolate_mode_t mode,
					struct zone *z, int active, int file)
{
	int lru = LRU_BASE;
	if (active)
		lru += LRU_ACTIVE;
	if (file)
		lru += LRU_FILE;
	return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order,
								mode, file);
}
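
/*
 * Worked example of the lumpy pfn arithmetic in isolate_lru_pages(),
 * with an illustrative pfn.  For order = 4 and a tag page at pfn
 * 0x12345:
 *
 *	pfn     = 0x12345 & ~((1 << 4) - 1) = 0x12340
 *	end_pfn = 0x12340 + 16              = 0x12350
 *
 * so the scan covers the 16-page naturally aligned block containing
 * the tag page, skipping the tag page itself and aborting on zone
 * holes, zone boundaries, or a page that fails __isolate_lru_page().
 */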

/*
 * clear_active_flags() is a helper for shrink_active_list(), clearing
 * any active bits from the pages in the list.
 */
static unsigned long clear_active_flags(struct list_head *page_list,
					unsigned int *count)
{
	int nr_active = 0;
	int lru;
	struct page *page;

	list_for_each_entry(page, page_list, lru) {
		int numpages = hpage_nr_pages(page);
		lru = page_lru_base_type(page);
		if (PageActive(page)) {
			lru += LRU_ACTIVE;
			ClearPageActive(page);
			nr_active += numpages;
		}
		if (count)
			count[lru] += numpages;
	}

	return nr_active;
}

/**
 * isolate_lru_page - tries to isolate a page from its LRU list
 * @page: page to isolate from its LRU list
 *
 * Isolates a @page from an LRU list, clears PageLRU and adjusts the
 * vmstat statistic corresponding to whatever LRU list the page was on.
 *
 * Returns 0 if the page was removed from an LRU list.
 * Returns -EBUSY if the page was not on an LRU list.
 *
 * The returned page will have PageLRU() cleared.  If it was found on
 * the active list, it will have PageActive set.  If it was found on
 * the unevictable list, it will have the PageUnevictable bit set. That flag
 * may need to be cleared by the caller before letting the page go.
 *
 * The vmstat statistic corresponding to the list on which the page was
 * found will be decremented.
 *
 * Restrictions:
 * (1) Must be called with an elevated refcount on the page. This is a
 *     fundamental difference from isolate_lru_pages (which is called
 *     without a stable reference).
 * (2) the lru_lock must not be held.
 * (3) interrupts must be enabled.
 */
int isolate_lru_page(struct page *page)
{
	int ret = -EBUSY;

	VM_BUG_ON(!page_count(page));

	if (PageLRU(page)) {
		struct zone *zone = page_zone(page);

		spin_lock_irq(&zone->lru_lock);
		if (PageLRU(page)) {
			int lru = page_lru(page);
			ret = 0;
			get_page(page);
			ClearPageLRU(page);

			del_page_from_lru_list(zone, page, lru);
		}
		spin_unlock_irq(&zone->lru_lock);
	}
	return ret;
}

/*
 * Are there way too many processes in the direct reclaim path already?
 */
static int too_many_isolated(struct zone *zone, int file,
		struct scan_control *sc)
{
	unsigned long inactive, isolated;

	if (current_is_kswapd())
		return 0;

	if (!scanning_global_lru(sc))
		return 0;

	if (file) {
		inactive = zone_page_state(zone, NR_INACTIVE_FILE);
		isolated = zone_page_state(zone, NR_ISOLATED_FILE);
	} else {
		inactive = zone_page_state(zone, NR_INACTIVE_ANON);
		isolated = zone_page_state(zone, NR_ISOLATED_ANON);
	}

	return isolated > inactive;
}

/*
 * TODO: Try merging with migration's version of putback_lru_pages
 */
static noinline_for_stack void
putback_lru_pages(struct zone *zone, struct scan_control *sc,
				unsigned long nr_anon, unsigned long nr_file,
				struct list_head *page_list)
{
	struct page *page;
	struct pagevec pvec;
	struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);

	pagevec_init(&pvec, 1);

	/*
	 * Put back any unfreeable pages.
	 */
	spin_lock(&zone->lru_lock);
	while (!list_empty(page_list)) {
		int lru;
		page = lru_to_page(page_list);
		VM_BUG_ON(PageLRU(page));
		list_del(&page->lru);
		if (unlikely(!page_evictable(page, NULL))) {
			spin_unlock_irq(&zone->lru_lock);
			putback_lru_page(page);
			spin_lock_irq(&zone->lru_lock);
			continue;
		}
		SetPageLRU(page);
		lru = page_lru(page);
		add_page_to_lru_list(zone, page, lru);
		if (is_active_lru(lru)) {
			int file = is_file_lru(lru);
			int numpages = hpage_nr_pages(page);
			reclaim_stat->recent_rotated[file] += numpages;
		}
		if (!pagevec_add(&pvec, page)) {
			spin_unlock_irq(&zone->lru_lock);
			__pagevec_release(&pvec);
			spin_lock_irq(&zone->lru_lock);
		}
	}
	__mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon);
	__mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file);

	spin_unlock_irq(&zone->lru_lock);
	pagevec_release(&pvec);
}

static noinline_for_stack void update_isolated_counts(struct zone *zone,
					struct scan_control *sc,
					unsigned long *nr_anon,
					unsigned long *nr_file,
					struct list_head *isolated_list)
{
	unsigned long nr_active;
	unsigned int count[NR_LRU_LISTS] = { 0, };
	struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);

	nr_active = clear_active_flags(isolated_list, count);
	__count_vm_events(PGDEACTIVATE, nr_active);

	__mod_zone_page_state(zone, NR_ACTIVE_FILE,
			      -count[LRU_ACTIVE_FILE]);
	__mod_zone_page_state(zone, NR_INACTIVE_FILE,
			      -count[LRU_INACTIVE_FILE]);
	__mod_zone_page_state(zone, NR_ACTIVE_ANON,
			      -count[LRU_ACTIVE_ANON]);
	__mod_zone_page_state(zone, NR_INACTIVE_ANON,
			      -count[LRU_INACTIVE_ANON]);

	*nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
	*nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
	__mod_zone_page_state(zone, NR_ISOLATED_ANON, *nr_anon);
	__mod_zone_page_state(zone, NR_ISOLATED_FILE, *nr_file);

	reclaim_stat->recent_scanned[0] += *nr_anon;
	reclaim_stat->recent_scanned[1] += *nr_file;
}
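
/*
 * Worked example of the bookkeeping above, with made-up numbers.  For
 * an isolated batch of 32 pages counted as
 *
 *	count[LRU_ACTIVE_ANON]   =  4	count[LRU_ACTIVE_FILE]   = 10
 *	count[LRU_INACTIVE_ANON] =  6	count[LRU_INACTIVE_FILE] = 12
 *
 * clear_active_flags() reports nr_active = 14 (all 14 lose PG_active
 * and count as PGDEACTIVATE), the four NR_*_ANON/FILE counters drop by
 * their respective counts, and NR_ISOLATED_ANON/NR_ISOLATED_FILE rise
 * by *nr_anon = 10 and *nr_file = 22.
 */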

/*
 * Returns true if a direct reclaim should wait on pages under writeback.
 *
 * If we are direct reclaiming for contiguous pages and we do not reclaim
 * everything in the list, try again and wait for writeback IO to complete.
 * This will stall high-order allocations noticeably. Only do that when we
 * really need to free the pages under high memory pressure.
 */
static inline bool should_reclaim_stall(unsigned long nr_taken,
					unsigned long nr_freed,
					int priority,
					struct scan_control *sc)
{
	int lumpy_stall_priority;

	/* kswapd should not stall on sync IO */
	if (current_is_kswapd())
		return false;

	/* Only stall on lumpy reclaim */
	if (sc->reclaim_mode & RECLAIM_MODE_SINGLE)
		return false;

	/* If we have reclaimed everything on the isolated list, no stall */
	if (nr_freed == nr_taken)
		return false;

	/*
	 * For high-order allocations, there are two stall thresholds.
	 * High-cost allocations stall immediately, whereas lower-order
	 * allocations such as stacks require the scanning priority to be
	 * much higher before stalling.
	 */
	if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
		lumpy_stall_priority = DEF_PRIORITY;
	else
		lumpy_stall_priority = DEF_PRIORITY / 3;

	return priority <= lumpy_stall_priority;
}
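
/*
 * With DEF_PRIORITY = 12, the thresholds above work out to (an
 * illustrative reading, not new policy):
 *
 *	order > PAGE_ALLOC_COSTLY_ORDER: lumpy_stall_priority = 12, so
 *		any priority qualifies and the stall is immediate;
 *	smaller orders: lumpy_stall_priority = 4, so reclaim must have
 *		fought its way down from priority 12 to 4 before a
 *		partially-failed isolation is worth a synchronous retry.
 */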

/*
 * shrink_inactive_list() is a helper for shrink_zone().  It returns the number
 * of reclaimed pages
 */
static noinline_for_stack unsigned long
shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
		     struct scan_control *sc, int priority, int file)
{
	LIST_HEAD(page_list);
	unsigned long nr_scanned;
	unsigned long nr_reclaimed = 0;
	unsigned long nr_taken;
	unsigned long nr_anon;
	unsigned long nr_file;
	unsigned long nr_dirty = 0;
	unsigned long nr_writeback = 0;
	isolate_mode_t reclaim_mode = ISOLATE_INACTIVE;

	while (unlikely(too_many_isolated(zone, file, sc))) {
		congestion_wait(BLK_RW_ASYNC, HZ/10);

		/* We are about to die and free our memory. Return now. */
		if (fatal_signal_pending(current))
			return SWAP_CLUSTER_MAX;
	}

	set_reclaim_mode(priority, sc, false);
	if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
		reclaim_mode |= ISOLATE_ACTIVE;

	lru_add_drain();

	if (!sc->may_unmap)
		reclaim_mode |= ISOLATE_UNMAPPED;
	if (!sc->may_writepage)
		reclaim_mode |= ISOLATE_CLEAN;

	spin_lock_irq(&zone->lru_lock);

	if (scanning_global_lru(sc)) {
		nr_taken = isolate_pages_global(nr_to_scan, &page_list,
			&nr_scanned, sc->order, reclaim_mode, zone, 0, file);
		zone->pages_scanned += nr_scanned;
		if (current_is_kswapd())
			__count_zone_vm_events(PGSCAN_KSWAPD, zone,
					       nr_scanned);
		else
			__count_zone_vm_events(PGSCAN_DIRECT, zone,
					       nr_scanned);
	} else {
		nr_taken = mem_cgroup_isolate_pages(nr_to_scan, &page_list,
			&nr_scanned, sc->order, reclaim_mode, zone,
			sc->mem_cgroup, 0, file);
		/*
		 * mem_cgroup_isolate_pages() keeps track of
		 * scanned pages on its own.
		 */
	}

	if (nr_taken == 0) {
		spin_unlock_irq(&zone->lru_lock);
		return 0;
	}

	update_isolated_counts(zone, sc, &nr_anon, &nr_file, &page_list);

	spin_unlock_irq(&zone->lru_lock);

	nr_reclaimed = shrink_page_list(&page_list, zone, sc, priority,
						&nr_dirty, &nr_writeback);

	/* Check if we should synchronously wait for writeback */
	if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
		set_reclaim_mode(priority, sc, true);
		nr_reclaimed += shrink_page_list(&page_list, zone, sc,
					priority, &nr_dirty, &nr_writeback);
	}

	local_irq_disable();
	if (current_is_kswapd())
		__count_vm_events(KSWAPD_STEAL, nr_reclaimed);
	__count_zone_vm_events(PGSTEAL, zone, nr_reclaimed);

	putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list);

	/*
	 * If reclaim is isolating dirty pages under writeback, it implies
	 * that the long-lived page allocation rate is exceeding the page
	 * laundering rate. Either the global limits are not being effective
	 * at throttling processes due to the page distribution throughout
	 * zones or there is heavy usage of a slow backing device. The
	 * only option is to throttle from reclaim context which is not ideal
	 * as there is no guarantee the dirtying process is throttled in the
	 * same way balance_dirty_pages() manages.
	 *
	 * This scales the number of dirty pages that must be under writeback
	 * before throttling depending on priority. It is a simple backoff
	 * function that has the most effect in the range DEF_PRIORITY to
	 * DEF_PRIORITY-2, the priority at which reclaim is considered to be
	 * in trouble.
	 *
	 * DEF_PRIORITY   100% isolated pages must be PageWriteback to throttle
	 * DEF_PRIORITY-1  50% must be PageWriteback
	 * DEF_PRIORITY-2  25% must be PageWriteback, kswapd in trouble
	 * ...
	 * DEF_PRIORITY-6 For SWAP_CLUSTER_MAX isolated pages, throttle if any
	 *                isolated page is PageWriteback
	 */
	if (nr_writeback && nr_writeback >= (nr_taken >> (DEF_PRIORITY-priority)))
		wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);

	trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
		zone_idx(zone),
		nr_scanned, nr_reclaimed,
		priority,
		trace_shrink_flags(file, sc->reclaim_mode));
	return nr_reclaimed;
}

/*
 * This moves pages from the active list to the inactive list.
 *
 * We move them the other way if the page is referenced by one or more
 * processes, from rmap.
 *
 * If the pages are mostly unmapped, the processing is fast and it is
 * appropriate to hold zone->lru_lock across the whole operation.  But if
 * the pages are mapped, the processing is slow (page_referenced()) so we
 * should drop zone->lru_lock around each page.  It's impossible to balance
 * this, so instead we remove the pages from the LRU while processing them.
 * It is safe to rely on PG_active against the non-LRU pages in here because
 * nobody will play with that bit on a non-LRU page.
 *
 * The downside is that we have to touch page->_count against each page.
 * But we had to alter page->flags anyway.
 */

static void move_active_pages_to_lru(struct zone *zone,
				     struct list_head *list,
				     enum lru_list lru)
{
	unsigned long pgmoved = 0;
	struct pagevec pvec;
	struct page *page;

	pagevec_init(&pvec, 1);

	while (!list_empty(list)) {
		page = lru_to_page(list);

		VM_BUG_ON(PageLRU(page));
		SetPageLRU(page);

		list_move(&page->lru, &zone->lru[lru].list);
		mem_cgroup_add_lru_list(page, lru);
		pgmoved += hpage_nr_pages(page);

		if (!pagevec_add(&pvec, page) || list_empty(list)) {
			spin_unlock_irq(&zone->lru_lock);
			if (buffer_heads_over_limit)
				pagevec_strip(&pvec);
			__pagevec_release(&pvec);
			spin_lock_irq(&zone->lru_lock);
		}
	}
	__mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
	if (!is_active_lru(lru))
		__count_vm_events(PGDEACTIVATE, pgmoved);
}
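
/*
 * The lru index arithmetic used by the caller below.  With the enum
 * laid out as base/active pairs per type, shrink_active_list() selects
 * its target lists as:
 *
 *	LRU_ACTIVE + file * LRU_FILE -> LRU_ACTIVE_ANON or LRU_ACTIVE_FILE
 *	LRU_BASE   + file * LRU_FILE -> LRU_INACTIVE_ANON or LRU_INACTIVE_FILE
 *
 * e.g. file = 1 gives LRU_ACTIVE + LRU_FILE = LRU_ACTIVE_FILE for the
 * pages that earned another trip around the active list.
 */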

static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
			       struct scan_control *sc, int priority, int file)
{
	unsigned long nr_taken;
	unsigned long pgscanned;
	unsigned long vm_flags;
	LIST_HEAD(l_hold);	/* The pages which were snipped off */
	LIST_HEAD(l_active);
	LIST_HEAD(l_inactive);
	struct page *page;
	struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
	unsigned long nr_rotated = 0;
	isolate_mode_t reclaim_mode = ISOLATE_ACTIVE;

	lru_add_drain();

	if (!sc->may_unmap)
		reclaim_mode |= ISOLATE_UNMAPPED;
	if (!sc->may_writepage)
		reclaim_mode |= ISOLATE_CLEAN;

	spin_lock_irq(&zone->lru_lock);
	if (scanning_global_lru(sc)) {
		nr_taken = isolate_pages_global(nr_pages, &l_hold,
						&pgscanned, sc->order,
						reclaim_mode, zone,
						1, file);
		zone->pages_scanned += pgscanned;
	} else {
		nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold,
						&pgscanned, sc->order,
						reclaim_mode, zone,
						sc->mem_cgroup, 1, file);
		/*
		 * mem_cgroup_isolate_pages() keeps track of
		 * scanned pages on its own.
		 */
	}

	reclaim_stat->recent_scanned[file] += nr_taken;

	__count_zone_vm_events(PGREFILL, zone, pgscanned);
	if (file)
		__mod_zone_page_state(zone, NR_ACTIVE_FILE, -nr_taken);
	else
		__mod_zone_page_state(zone, NR_ACTIVE_ANON, -nr_taken);
	__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
	spin_unlock_irq(&zone->lru_lock);

	while (!list_empty(&l_hold)) {
		cond_resched();
		page = lru_to_page(&l_hold);
		list_del(&page->lru);

		if (unlikely(!page_evictable(page, NULL))) {
			putback_lru_page(page);
			continue;
		}

		if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) {
			nr_rotated += hpage_nr_pages(page);
			/*
			 * Identify referenced, file-backed active pages and
			 * give them one more trip around the active list, so
			 * that executable code gets a better chance to stay
			 * in memory under moderate memory pressure.  Anon
			 * pages are not likely to be evicted by use-once
			 * streaming IO, plus JVM can create lots of anon
			 * VM_EXEC pages, so we ignore them here.
			 */
			if ((vm_flags & VM_EXEC) && page_is_file_cache(page)) {
				list_add(&page->lru, &l_active);
				continue;
			}
		}

		ClearPageActive(page);	/* we are de-activating */
		list_add(&page->lru, &l_inactive);
	}

	/*
	 * Move pages back to the lru list.
	 */
	spin_lock_irq(&zone->lru_lock);
	/*
	 * Count referenced pages from currently used mappings as rotated,
	 * even though only some of them are actually re-activated.  This
	 * helps balance scan pressure between file and anonymous pages in
	 * get_scan_ratio.
	 */
	reclaim_stat->recent_rotated[file] += nr_rotated;

	move_active_pages_to_lru(zone, &l_active,
						LRU_ACTIVE + file * LRU_FILE);
	move_active_pages_to_lru(zone, &l_inactive,
						LRU_BASE   + file * LRU_FILE);
	__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
	spin_unlock_irq(&zone->lru_lock);
}

#ifdef CONFIG_SWAP
static int inactive_anon_is_low_global(struct zone *zone)
{
	unsigned long active, inactive;

	active = zone_page_state(zone, NR_ACTIVE_ANON);
	inactive = zone_page_state(zone, NR_INACTIVE_ANON);

	if (inactive * zone->inactive_ratio < active)
		return 1;

	return 0;
}

/**
 * inactive_anon_is_low - check if anonymous pages need to be deactivated
 * @zone: zone to check
 * @sc:   scan control of this context
 *
 * Returns true if the zone does not have enough inactive anon pages,
 * meaning some active anon pages need to be deactivated.
 */
static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc)
{
	int low;

	/*
	 * If we don't have swap space, anonymous page deactivation
	 * is pointless.
	 */
	if (!total_swap_pages)
		return 0;

	if (scanning_global_lru(sc))
		low = inactive_anon_is_low_global(zone);
	else
		low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup, zone);
	return low;
}
#else
static inline int inactive_anon_is_low(struct zone *zone,
					struct scan_control *sc)
{
	return 0;
}
#endif

static int inactive_file_is_low_global(struct zone *zone)
{
	unsigned long active, inactive;

	active = zone_page_state(zone, NR_ACTIVE_FILE);
	inactive = zone_page_state(zone, NR_INACTIVE_FILE);

	return (active > inactive);
}
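
/*
 * Worked examples of the two balance checks, with illustrative numbers.
 * For anon, supposing zone->inactive_ratio = 3, active = 3000 pages and
 * inactive = 800 pages:
 *
 *	800 * 3 = 2400 < 3000	-> inactive list is low, deactivate some
 *
 * For file pages the ratio is effectively fixed at 1:1, so active = 3000
 * against inactive = 800 simply returns (3000 > 800) = true.
 */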

/**
 * inactive_file_is_low - check if file pages need to be deactivated
 * @zone: zone to check
 * @sc:   scan control of this context
 *
 * When the system is doing streaming IO, memory pressure here
 * ensures that active file pages get deactivated, until more
 * than half of the file pages are on the inactive list.
 *
 * Once we get to that situation, protect the system's working
 * set from being evicted by disabling active file page aging.
 *
 * This uses a different ratio than the anonymous pages, because
 * the page cache uses a use-once replacement algorithm.
 */
static int inactive_file_is_low(struct zone *zone, struct scan_control *sc)
{
	int low;

	if (scanning_global_lru(sc))
		low = inactive_file_is_low_global(zone);
	else
		low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup, zone);
	return low;
}

static int inactive_list_is_low(struct zone *zone, struct scan_control *sc,
				int file)
{
	if (file)
		return inactive_file_is_low(zone, sc);
	else
		return inactive_anon_is_low(zone, sc);
}

static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
	struct zone *zone, struct scan_control *sc, int priority)
{
	int file = is_file_lru(lru);

	if (is_active_lru(lru)) {
		if (inactive_list_is_low(zone, sc, file))
			shrink_active_list(nr_to_scan, zone, sc, priority, file);
		return 0;
	}

	return shrink_inactive_list(nr_to_scan, zone, sc, priority, file);
}

static int vmscan_swappiness(struct scan_control *sc)
{
	if (scanning_global_lru(sc))
		return vm_swappiness;
	return mem_cgroup_swappiness(sc->mem_cgroup);
}

/*
 * Determine how aggressively the anon and file LRU lists should be
 * scanned.  The relative value of each set of LRU lists is determined
 * by looking at the fraction of the scanned pages that were rotated
 * back onto the active list instead of being evicted.
 *
 * nr[0] = anon pages to scan; nr[1] = file pages to scan
 */
static void get_scan_count(struct zone *zone, struct scan_control *sc,
					unsigned long *nr, int priority)
{
	unsigned long anon, file, free;
	unsigned long anon_prio, file_prio;
	unsigned long ap, fp;
	struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
	u64 fraction[2], denominator;
	enum lru_list l;
	int noswap = 0;
	bool force_scan = false;

	/*
	 * If the zone or memcg is small, nr[l] can be 0.  This
	 * results in no scanning on this priority and a potential
	 * priority drop.  Global direct reclaim can go to the next
	 * zone and tends to have no problems. Global kswapd is for
	 * zone balancing and it needs to scan a minimum amount. When
	 * reclaiming for a memcg, a priority drop can cause high
	 * latencies, so it's better to scan a minimum amount there as
	 * well.
	 */
	if (scanning_global_lru(sc) && current_is_kswapd())
		force_scan = true;
	if (!scanning_global_lru(sc))
		force_scan = true;

	/* If we have no swap space, do not bother scanning anon pages. */
	if (!sc->may_swap || (nr_swap_pages <= 0)) {
		noswap = 1;
		fraction[0] = 0;
		fraction[1] = 1;
		denominator = 1;
		goto out;
	}

	anon  = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
		zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
	file  = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
		zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);

	if (scanning_global_lru(sc)) {
		free  = zone_page_state(zone, NR_FREE_PAGES);
		/* If we have very few page cache pages,
		   force-scan anon pages. */
		if (unlikely(file + free <= high_wmark_pages(zone))) {
			fraction[0] = 1;
			fraction[1] = 0;
			denominator = 1;
			goto out;
		}
	}

	/*
	 * With swappiness at 100, anonymous and file have the same priority.
	 * This scanning priority is essentially the inverse of IO cost.
1914 */ 1915 anon_prio = vmscan_swappiness(sc); 1916 file_prio = 200 - vmscan_swappiness(sc); 1917 1918 /* 1919 * OK, so we have swap space and a fair amount of page cache 1920 * pages. We use the recently rotated / recently scanned 1921 * ratios to determine how valuable each cache is. 1922 * 1923 * Because workloads change over time (and to avoid overflow) 1924 * we keep these statistics as a floating average, which ends 1925 * up weighing recent references more than old ones. 1926 * 1927 * anon in [0], file in [1] 1928 */ 1929 spin_lock_irq(&zone->lru_lock); 1930 if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { 1931 reclaim_stat->recent_scanned[0] /= 2; 1932 reclaim_stat->recent_rotated[0] /= 2; 1933 } 1934 1935 if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) { 1936 reclaim_stat->recent_scanned[1] /= 2; 1937 reclaim_stat->recent_rotated[1] /= 2; 1938 } 1939 1940 /* 1941 * The amount of pressure on anon vs file pages is inversely 1942 * proportional to the fraction of recently scanned pages on 1943 * each list that were recently referenced and in active use. 1944 */ 1945 ap = (anon_prio + 1) * (reclaim_stat->recent_scanned[0] + 1); 1946 ap /= reclaim_stat->recent_rotated[0] + 1; 1947 1948 fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1); 1949 fp /= reclaim_stat->recent_rotated[1] + 1; 1950 spin_unlock_irq(&zone->lru_lock); 1951 1952 fraction[0] = ap; 1953 fraction[1] = fp; 1954 denominator = ap + fp + 1; 1955out: 1956 for_each_evictable_lru(l) { 1957 int file = is_file_lru(l); 1958 unsigned long scan; 1959 1960 scan = zone_nr_lru_pages(zone, sc, l); 1961 if (priority || noswap) { 1962 scan >>= priority; 1963 if (!scan && force_scan) 1964 scan = SWAP_CLUSTER_MAX; 1965 scan = div64_u64(scan * fraction[file], denominator); 1966 } 1967 nr[l] = scan; 1968 } 1969} 1970 1971/* 1972 * Reclaim/compaction depends on a number of pages being freed. To avoid 1973 * disruption to the system, a small number of order-0 pages continue to be 1974 * rotated and reclaimed in the normal fashion. However, by the time we get 1975 * back to the allocator and call try_to_compact_pages(), we ensure that 1976 * there are enough free pages for it to be likely successful. 1977 */ 1978static inline bool should_continue_reclaim(struct zone *zone, 1979 unsigned long nr_reclaimed, 1980 unsigned long nr_scanned, 1981 struct scan_control *sc) 1982{ 1983 unsigned long pages_for_compaction; 1984 unsigned long inactive_lru_pages; 1985 1986 /* If not in reclaim/compaction mode, stop */ 1987 if (!(sc->reclaim_mode & RECLAIM_MODE_COMPACTION)) 1988 return false; 1989 1990 /* Consider stopping depending on scan and reclaim activity */ 1991 if (sc->gfp_mask & __GFP_REPEAT) { 1992 /* 1993 * For __GFP_REPEAT allocations, stop reclaiming if the 1994 * full LRU list has been scanned and we are still failing 1995 * to reclaim pages. This full LRU scan is potentially 1996 * expensive but a __GFP_REPEAT caller really wants to succeed. 1997 */ 1998 if (!nr_reclaimed && !nr_scanned) 1999 return false; 2000 } else { 2001 /* 2002 * For non-__GFP_REPEAT allocations which can presumably 2003 * fail without consequence, stop if we failed to reclaim 2004 * any pages from the last SWAP_CLUSTER_MAX number of 2005 * pages that were scanned.
This will return to the 2006 * caller faster at the risk that reclaim/compaction and 2007 * the resulting allocation attempt fail 2008 */ 2009 if (!nr_reclaimed) 2010 return false; 2011 } 2012 2013 /* 2014 * If we have not reclaimed enough pages for compaction and the 2015 * inactive lists are large enough, continue reclaiming 2016 */ 2017 pages_for_compaction = (2UL << sc->order); 2018 inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON) + 2019 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); 2020 if (sc->nr_reclaimed < pages_for_compaction && 2021 inactive_lru_pages > pages_for_compaction) 2022 return true; 2023 2024 /* If compaction would go ahead or the allocation would succeed, stop */ 2025 switch (compaction_suitable(zone, sc->order)) { 2026 case COMPACT_PARTIAL: 2027 case COMPACT_CONTINUE: 2028 return false; 2029 default: 2030 return true; 2031 } 2032} 2033 2034/* 2035 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. 2036 */ 2037static void shrink_zone(int priority, struct zone *zone, 2038 struct scan_control *sc) 2039{ 2040 unsigned long nr[NR_LRU_LISTS]; 2041 unsigned long nr_to_scan; 2042 enum lru_list l; 2043 unsigned long nr_reclaimed, nr_scanned; 2044 unsigned long nr_to_reclaim = sc->nr_to_reclaim; 2045 struct blk_plug plug; 2046 2047restart: 2048 nr_reclaimed = 0; 2049 nr_scanned = sc->nr_scanned; 2050 get_scan_count(zone, sc, nr, priority); 2051 2052 blk_start_plug(&plug); 2053 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || 2054 nr[LRU_INACTIVE_FILE]) { 2055 for_each_evictable_lru(l) { 2056 if (nr[l]) { 2057 nr_to_scan = min_t(unsigned long, 2058 nr[l], SWAP_CLUSTER_MAX); 2059 nr[l] -= nr_to_scan; 2060 2061 nr_reclaimed += shrink_list(l, nr_to_scan, 2062 zone, sc, priority); 2063 } 2064 } 2065 /* 2066 * On large memory systems, scan >> priority can become 2067 * really large. This is fine for the starting priority; 2068 * we want to put equal scanning pressure on each zone. 2069 * However, if the VM has a harder time of freeing pages, 2070 * with multiple processes reclaiming pages, the total 2071 * freeing target can get unreasonably large. 2072 */ 2073 if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) 2074 break; 2075 } 2076 blk_finish_plug(&plug); 2077 sc->nr_reclaimed += nr_reclaimed; 2078 2079 /* 2080 * Even if we did not try to evict anon pages at all, we want to 2081 * rebalance the anon lru active/inactive ratio. 2082 */ 2083 if (inactive_anon_is_low(zone, sc)) 2084 shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); 2085 2086 /* reclaim/compaction might need reclaim to continue */ 2087 if (should_continue_reclaim(zone, nr_reclaimed, 2088 sc->nr_scanned - nr_scanned, sc)) 2089 goto restart; 2090 2091 throttle_vm_writeout(sc->gfp_mask); 2092} 2093 2094/* 2095 * This is the direct reclaim path, for page-allocating processes. We only 2096 * try to reclaim pages from zones which will satisfy the caller's allocation 2097 * request. 2098 * 2099 * We reclaim from a zone even if that zone is over high_wmark_pages(zone). 2100 * Because: 2101 * a) The caller may be trying to free *extra* pages to satisfy a higher-order 2102 * allocation or 2103 * b) The target zone may be at high_wmark_pages(zone) but the lower zones 2104 * must go *over* high_wmark_pages(zone) to satisfy the `incremental min' 2105 * zone defense algorithm. 2106 * 2107 * If a zone is deemed to be full of pinned pages then just give it a light 2108 * scan and then give up on it.
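 *
 * Editor's note (roughly): the allocator's watermark check adds a
 * lowmem_reserve[] protection for the requested classzone, so a lower
 * zone serving a higher-zone-capable request must hold strictly more
 * free pages than what makes it look balanced in isolation.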
2109 * 2110 * This function returns true if a zone is being reclaimed for a costly 2111 * high-order allocation and compaction is either ready to begin or deferred. 2112 * This indicates to the caller that it should retry the allocation or fail. 2113 */ 2114static bool shrink_zones(int priority, struct zonelist *zonelist, 2115 struct scan_control *sc) 2116{ 2117 struct zoneref *z; 2118 struct zone *zone; 2119 unsigned long nr_soft_reclaimed; 2120 unsigned long nr_soft_scanned; 2121 bool should_abort_reclaim = false; 2122 2123 for_each_zone_zonelist_nodemask(zone, z, zonelist, 2124 gfp_zone(sc->gfp_mask), sc->nodemask) { 2125 if (!populated_zone(zone)) 2126 continue; 2127 /* 2128 * Note that memory controller reclaiming has only a small 2129 * influence on the global LRU. 2130 */ 2131 if (scanning_global_lru(sc)) { 2132 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 2133 continue; 2134 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 2135 continue; /* Let kswapd poll it */ 2136 if (COMPACTION_BUILD) { 2137 /* 2138 * If we already have plenty of memory free for 2139 * compaction in this zone, don't free any more. 2140 * Even though compaction is invoked for any 2141 * non-zero order, only frequent costly order 2142 * reclamation is disruptive enough to become a 2143 * noticeable problem, like transparent huge page 2144 * allocations. 2145 */ 2146 if (sc->order > PAGE_ALLOC_COSTLY_ORDER && 2147 (compaction_suitable(zone, sc->order) || 2148 compaction_deferred(zone))) { 2149 should_abort_reclaim = true; 2150 continue; 2151 } 2152 } 2153 /* 2154 * This steals pages from memory cgroups over their soft limit 2155 * and returns the number of reclaimed pages and 2156 * scanned pages. This works for global memory pressure 2157 * and balancing, not for a memcg's limit. 2158 */ 2159 nr_soft_scanned = 0; 2160 nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, 2161 sc->order, sc->gfp_mask, 2162 &nr_soft_scanned); 2163 sc->nr_reclaimed += nr_soft_reclaimed; 2164 sc->nr_scanned += nr_soft_scanned; 2165 /* need some check to avoid more shrink_zone() calls */ 2166 } 2167 2168 shrink_zone(priority, zone, sc); 2169 } 2170 2171 return should_abort_reclaim; 2172} 2173 2174static bool zone_reclaimable(struct zone *zone) 2175{ 2176 return zone->pages_scanned < zone_reclaimable_pages(zone) * 6; 2177} 2178 2179/* All zones in zonelist are unreclaimable? */ 2180static bool all_unreclaimable(struct zonelist *zonelist, 2181 struct scan_control *sc) 2182{ 2183 struct zoneref *z; 2184 struct zone *zone; 2185 2186 for_each_zone_zonelist_nodemask(zone, z, zonelist, 2187 gfp_zone(sc->gfp_mask), sc->nodemask) { 2188 if (!populated_zone(zone)) 2189 continue; 2190 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 2191 continue; 2192 if (!zone->all_unreclaimable) 2193 return false; 2194 } 2195 2196 return true; 2197} 2198 2199/* 2200 * This is the main entry point to direct page reclaim. 2201 * 2202 * If a full scan of the inactive list fails to free enough memory then we 2203 * are "out of memory" and something needs to be killed. 2204 * 2205 * If the caller is !__GFP_FS then the probability of a failure is reasonably 2206 * high - the zone may be full of dirty or under-writeback pages, which this 2207 * caller can't do much about. We kick the writeback threads and take explicit 2208 * naps in the hope that some of these pages can be written. But if the 2209 * allocating task holds filesystem locks which prevent writeout this might not 2210 * work, and the allocation attempt will fail.
2211 * 2212 * returns: 0, if no pages reclaimed 2213 * else, the number of pages reclaimed 2214 */ 2215static unsigned long do_try_to_free_pages(struct zonelist *zonelist, 2216 struct scan_control *sc, 2217 struct shrink_control *shrink) 2218{ 2219 int priority; 2220 unsigned long total_scanned = 0; 2221 struct reclaim_state *reclaim_state = current->reclaim_state; 2222 struct zoneref *z; 2223 struct zone *zone; 2224 unsigned long writeback_threshold; 2225 2226 get_mems_allowed(); 2227 delayacct_freepages_start(); 2228 2229 if (scanning_global_lru(sc)) 2230 count_vm_event(ALLOCSTALL); 2231 2232 for (priority = DEF_PRIORITY; priority >= 0; priority--) { 2233 sc->nr_scanned = 0; 2234 if (!priority) 2235 disable_swap_token(sc->mem_cgroup); 2236 if (shrink_zones(priority, zonelist, sc)) 2237 break; 2238 2239 /* 2240 * Don't shrink slabs when reclaiming memory from 2241 * over-limit cgroups 2242 */ 2243 if (scanning_global_lru(sc)) { 2244 unsigned long lru_pages = 0; 2245 for_each_zone_zonelist(zone, z, zonelist, 2246 gfp_zone(sc->gfp_mask)) { 2247 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 2248 continue; 2249 2250 lru_pages += zone_reclaimable_pages(zone); 2251 } 2252 2253 shrink_slab(shrink, sc->nr_scanned, lru_pages); 2254 if (reclaim_state) { 2255 sc->nr_reclaimed += reclaim_state->reclaimed_slab; 2256 reclaim_state->reclaimed_slab = 0; 2257 } 2258 } 2259 total_scanned += sc->nr_scanned; 2260 if (sc->nr_reclaimed >= sc->nr_to_reclaim) 2261 goto out; 2262 2263 /* 2264 * Try to write back as many pages as we just scanned. This 2265 * tends to cause slow streaming writers to write data to the 2266 * disk smoothly, at the dirtying rate, which is nice. But 2267 * that's undesirable in laptop mode, where we *want* lumpy 2268 * writeout. So in laptop mode, write out the whole world. 2269 */ 2270 writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2; 2271 if (total_scanned > writeback_threshold) { 2272 wakeup_flusher_threads(laptop_mode ? 0 : total_scanned, 2273 WB_REASON_TRY_TO_FREE_PAGES); 2274 sc->may_writepage = 1; 2275 } 2276 2277 /* Take a nap, wait for some writeback to complete */ 2278 if (!sc->hibernation_mode && sc->nr_scanned && 2279 priority < DEF_PRIORITY - 2) { 2280 struct zone *preferred_zone; 2281 2282 first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask), 2283 &cpuset_current_mems_allowed, 2284 &preferred_zone); 2285 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10); 2286 } 2287 } 2288 2289out: 2290 delayacct_freepages_end(); 2291 put_mems_allowed(); 2292 2293 if (sc->nr_reclaimed) 2294 return sc->nr_reclaimed; 2295 2296 /* 2297 * While hibernation is going on, kswapd is frozen so that it can't mark 2298 * the zone all_unreclaimable. We therefore bypass the all_unreclaimable 2299 * check here. 2300 */ 2301 if (oom_killer_disabled) 2302 return 0; 2303 2304 /* top priority shrink_zones still had more to do?
don't OOM, then */ 2305 if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc)) 2306 return 1; 2307 2308 return 0; 2309} 2310 2311unsigned long try_to_free_pages(struct zonelist *zonelist, int order, 2312 gfp_t gfp_mask, nodemask_t *nodemask) 2313{ 2314 unsigned long nr_reclaimed; 2315 struct scan_control sc = { 2316 .gfp_mask = gfp_mask, 2317 .may_writepage = !laptop_mode, 2318 .nr_to_reclaim = SWAP_CLUSTER_MAX, 2319 .may_unmap = 1, 2320 .may_swap = 1, 2321 .order = order, 2322 .mem_cgroup = NULL, 2323 .nodemask = nodemask, 2324 }; 2325 struct shrink_control shrink = { 2326 .gfp_mask = sc.gfp_mask, 2327 }; 2328 2329 trace_mm_vmscan_direct_reclaim_begin(order, 2330 sc.may_writepage, 2331 gfp_mask); 2332 2333 nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink); 2334 2335 trace_mm_vmscan_direct_reclaim_end(nr_reclaimed); 2336 2337 return nr_reclaimed; 2338} 2339 2340#ifdef CONFIG_CGROUP_MEM_RES_CTLR 2341 2342unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, 2343 gfp_t gfp_mask, bool noswap, 2344 struct zone *zone, 2345 unsigned long *nr_scanned) 2346{ 2347 struct scan_control sc = { 2348 .nr_scanned = 0, 2349 .nr_to_reclaim = SWAP_CLUSTER_MAX, 2350 .may_writepage = !laptop_mode, 2351 .may_unmap = 1, 2352 .may_swap = !noswap, 2353 .order = 0, 2354 .mem_cgroup = mem, 2355 }; 2356 2357 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 2358 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); 2359 2360 trace_mm_vmscan_memcg_softlimit_reclaim_begin(0, 2361 sc.may_writepage, 2362 sc.gfp_mask); 2363 2364 /* 2365 * NOTE: Although we can get the priority field, using it 2366 * here is not a good idea, since it limits the pages we can scan. 2367 * If we don't reclaim here, the shrink_zone from balance_pgdat 2368 * will pick up pages from other mem cgroups as well. We hack 2369 * the priority and make it zero. 2370 */ 2371 shrink_zone(0, zone, &sc); 2372 2373 trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); 2374 2375 *nr_scanned = sc.nr_scanned; 2376 return sc.nr_reclaimed; 2377} 2378 2379unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, 2380 gfp_t gfp_mask, 2381 bool noswap) 2382{ 2383 struct zonelist *zonelist; 2384 unsigned long nr_reclaimed; 2385 int nid; 2386 struct scan_control sc = { 2387 .may_writepage = !laptop_mode, 2388 .may_unmap = 1, 2389 .may_swap = !noswap, 2390 .nr_to_reclaim = SWAP_CLUSTER_MAX, 2391 .order = 0, 2392 .mem_cgroup = mem_cont, 2393 .nodemask = NULL, /* we don't care about placement */ 2394 .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 2395 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), 2396 }; 2397 struct shrink_control shrink = { 2398 .gfp_mask = sc.gfp_mask, 2399 }; 2400 2401 /* 2402 * Unlike direct reclaim via alloc_pages(), memcg reclaim doesn't 2403 * care where the pages come from, so the node where we start the 2404 * scan does not need to be the current node. 2405 */ 2406 nid = mem_cgroup_select_victim_node(mem_cont); 2407 2408 zonelist = NODE_DATA(nid)->node_zonelists; 2409 2410 trace_mm_vmscan_memcg_reclaim_begin(0, 2411 sc.may_writepage, 2412 sc.gfp_mask); 2413 2414 nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink); 2415 2416 trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); 2417 2418 return nr_reclaimed; 2419} 2420#endif 2421 2422/* 2423 * pgdat_balanced is used when checking if a node is balanced for high-order 2424 * allocations. Only zones that meet watermarks and are allowed by the 2425 * caller's classzone_idx are added to balanced_pages.
The total of 2426 * balanced pages must be at least 25% of the zones allowed by classzone_idx 2427 * for the node to be considered balanced. Forcing all zones to be balanced 2428 * for high orders can cause excessive reclaim when there are imbalanced zones. 2429 * The choice of 25% is due to 2430 * o a 16M DMA zone that is balanced will not balance a zone on any 2431 * reasonably sized machine 2432 * o On all other machines, the top zone must be at least a reasonable 2433 * percentage of the middle zones. For example, on 32-bit x86, highmem 2434 * would need to be at least 256M for it to balance a whole node. 2435 * Similarly, on x86-64 the Normal zone would need to be at least 1G 2436 * to balance a node on its own. These seemed like reasonable ratios. 2437 */ 2438static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages, 2439 int classzone_idx) 2440{ 2441 unsigned long present_pages = 0; 2442 int i; 2443 2444 for (i = 0; i <= classzone_idx; i++) 2445 present_pages += pgdat->node_zones[i].present_pages; 2446 2447 /* A special case here: if zone has no page, we think it's balanced */ 2448 return balanced_pages >= (present_pages >> 2); 2449} 2450 2451/* is kswapd sleeping prematurely? */ 2452static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining, 2453 int classzone_idx) 2454{ 2455 int i; 2456 unsigned long balanced = 0; 2457 bool all_zones_ok = true; 2458 2459 /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ 2460 if (remaining) 2461 return true; 2462 2463 /* Check the watermark levels */ 2464 for (i = 0; i <= classzone_idx; i++) { 2465 struct zone *zone = pgdat->node_zones + i; 2466 2467 if (!populated_zone(zone)) 2468 continue; 2469 2470 /* 2471 * balance_pgdat() skips over all_unreclaimable after 2472 * DEF_PRIORITY. Effectively, it considers them balanced so 2473 * they must be considered balanced here as well if kswapd 2474 * is to sleep. 2475 */ 2476 if (zone->all_unreclaimable) { 2477 balanced += zone->present_pages; 2478 continue; 2479 } 2480 2481 if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), 2482 i, 0)) 2483 all_zones_ok = false; 2484 else 2485 balanced += zone->present_pages; 2486 } 2487 2488 /* 2489 * For high-order requests, the balanced zones must contain at least 2490 * 25% of the node's pages for kswapd to sleep. For order-0, all zones 2491 * must be balanced. 2492 */ 2493 if (order) 2494 return !pgdat_balanced(pgdat, balanced, classzone_idx); 2495 else 2496 return !all_zones_ok; 2497} 2498 2499/* 2500 * For kswapd, balance_pgdat() will work across all this node's zones until 2501 * they are all at high_wmark_pages(zone). 2502 * 2503 * Returns the final order kswapd was reclaiming at 2504 * 2505 * There is special handling here for zones which are full of pinned pages. 2506 * This can happen if the pages are all mlocked, or if they are all used by 2507 * device drivers (say, ZONE_DMA). Or if they are all in use by hugetlb. 2508 * What we do is to detect the case where all pages in the zone have been 2509 * scanned twice and there has been zero successful reclaim. Mark the zone as 2510 * dead and from now on, only perform a short scan. Basically we're polling 2511 * the zone for when the problem goes away. 2512 * 2513 * kswapd scans the zones in the highmem->normal->dma direction.
It skips 2514 * zones which have free_pages > high_wmark_pages(zone), but once a zone is 2515 * found to have free_pages <= high_wmark_pages(zone), we scan that zone and the 2516 * lower zones regardless of the number of free pages in the lower zones. This 2517 * interoperates with the page allocator fallback scheme to ensure that aging 2518 * of pages is balanced across the zones. 2519 */ 2520static unsigned long balance_pgdat(pg_data_t *pgdat, int order, 2521 int *classzone_idx) 2522{ 2523 int all_zones_ok; 2524 unsigned long balanced; 2525 int priority; 2526 int i; 2527 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ 2528 unsigned long total_scanned; 2529 struct reclaim_state *reclaim_state = current->reclaim_state; 2530 unsigned long nr_soft_reclaimed; 2531 unsigned long nr_soft_scanned; 2532 struct scan_control sc = { 2533 .gfp_mask = GFP_KERNEL, 2534 .may_unmap = 1, 2535 .may_swap = 1, 2536 /* 2537 * kswapd doesn't want to be bailed out while reclaiming, because 2538 * we want to put equal scanning pressure on each zone. 2539 */ 2540 .nr_to_reclaim = ULONG_MAX, 2541 .order = order, 2542 .mem_cgroup = NULL, 2543 }; 2544 struct shrink_control shrink = { 2545 .gfp_mask = sc.gfp_mask, 2546 }; 2547loop_again: 2548 total_scanned = 0; 2549 sc.nr_reclaimed = 0; 2550 sc.may_writepage = !laptop_mode; 2551 count_vm_event(PAGEOUTRUN); 2552 2553 for (priority = DEF_PRIORITY; priority >= 0; priority--) { 2554 unsigned long lru_pages = 0; 2555 int has_under_min_watermark_zone = 0; 2556 2557 /* The swap token gets in the way of swapout... */ 2558 if (!priority) 2559 disable_swap_token(NULL); 2560 2561 all_zones_ok = 1; 2562 balanced = 0; 2563 2564 /* 2565 * Scan in the highmem->dma direction for the highest 2566 * zone which needs scanning 2567 */ 2568 for (i = pgdat->nr_zones - 1; i >= 0; i--) { 2569 struct zone *zone = pgdat->node_zones + i; 2570 2571 if (!populated_zone(zone)) 2572 continue; 2573 2574 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 2575 continue; 2576 2577 /* 2578 * Do some background aging of the anon list, to give 2579 * pages a chance to be referenced before reclaiming. 2580 */ 2581 if (inactive_anon_is_low(zone, &sc)) 2582 shrink_active_list(SWAP_CLUSTER_MAX, zone, 2583 &sc, priority, 0); 2584 2585 if (!zone_watermark_ok_safe(zone, order, 2586 high_wmark_pages(zone), 0, 0)) { 2587 end_zone = i; 2588 break; 2589 } else { 2590 /* If balanced, clear the congested flag */ 2591 zone_clear_flag(zone, ZONE_CONGESTED); 2592 } 2593 } 2594 if (i < 0) 2595 goto out; 2596 2597 for (i = 0; i <= end_zone; i++) { 2598 struct zone *zone = pgdat->node_zones + i; 2599 2600 lru_pages += zone_reclaimable_pages(zone); 2601 } 2602 2603 /* 2604 * Now scan the zone in the dma->highmem direction, stopping 2605 * at the last zone which needs scanning. 2606 * 2607 * We do this because the page allocator works in the opposite 2608 * direction. This prevents the page allocator from allocating 2609 * pages behind kswapd's direction of progress, which would 2610 * cause too much scanning of the lower zones. 2611 */ 2612 for (i = 0; i <= end_zone; i++) { 2613 struct zone *zone = pgdat->node_zones + i; 2614 int nr_slab; 2615 unsigned long balance_gap; 2616 2617 if (!populated_zone(zone)) 2618 continue; 2619 2620 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 2621 continue; 2622 2623 sc.nr_scanned = 0; 2624 2625 nr_soft_scanned = 0; 2626 /* 2627 * Call soft limit reclaim before calling shrink_zone.
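 *
 * Editor's note: soft limit reclaim is given first crack so that
 * memcgs over their soft limit absorb the pressure before pages
 * belonging to well-behaved groups in this zone are touched.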
2628 */ 2629 nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, 2630 order, sc.gfp_mask, 2631 &nr_soft_scanned); 2632 sc.nr_reclaimed += nr_soft_reclaimed; 2633 total_scanned += nr_soft_scanned; 2634 2635 /* 2636 * We put equal pressure on every zone, unless 2637 * one zone has way too many pages free 2638 * already. The "too many pages" is defined 2639 * as the high wmark plus a "gap" where the 2640 * gap is either the low watermark or 1% 2641 * of the zone, whichever is smaller. 2642 */ 2643 balance_gap = min(low_wmark_pages(zone), 2644 (zone->present_pages + 2645 KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / 2646 KSWAPD_ZONE_BALANCE_GAP_RATIO); 2647 if (!zone_watermark_ok_safe(zone, order, 2648 high_wmark_pages(zone) + balance_gap, 2649 end_zone, 0)) { 2650 shrink_zone(priority, zone, &sc); 2651 2652 reclaim_state->reclaimed_slab = 0; 2653 nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages); 2654 sc.nr_reclaimed += reclaim_state->reclaimed_slab; 2655 total_scanned += sc.nr_scanned; 2656 2657 if (nr_slab == 0 && !zone_reclaimable(zone)) 2658 zone->all_unreclaimable = 1; 2659 } 2660 2661 /* 2662 * If we've done a decent amount of scanning and 2663 * the reclaim ratio is low, start doing writepage 2664 * even in laptop mode 2665 */ 2666 if (total_scanned > SWAP_CLUSTER_MAX * 2 && 2667 total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) 2668 sc.may_writepage = 1; 2669 2670 if (zone->all_unreclaimable) { 2671 if (end_zone && end_zone == i) 2672 end_zone--; 2673 continue; 2674 } 2675 2676 if (!zone_watermark_ok_safe(zone, order, 2677 high_wmark_pages(zone), end_zone, 0)) { 2678 all_zones_ok = 0; 2679 /* 2680 * We are still under the min watermark. This 2681 * means that we have a GFP_ATOMIC allocation 2682 * failure risk. Hurry up! 2683 */ 2684 if (!zone_watermark_ok_safe(zone, order, 2685 min_wmark_pages(zone), end_zone, 0)) 2686 has_under_min_watermark_zone = 1; 2687 } else { 2688 /* 2689 * If a zone reaches its high watermark, 2690 * consider it to be no longer congested. It's 2691 * possible there are dirty pages backed by 2692 * congested BDIs but as pressure is relieved, 2693 * speculatively avoid congestion waits 2694 */ 2695 zone_clear_flag(zone, ZONE_CONGESTED); 2696 if (i <= *classzone_idx) 2697 balanced += zone->present_pages; 2698 } 2699 2700 } 2701 if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx))) 2702 break; /* kswapd: all done */ 2703 /* 2704 * OK, kswapd is getting into trouble. Take a nap, then take 2705 * another pass across the zones. 2706 */ 2707 if (total_scanned && (priority < DEF_PRIORITY - 2)) { 2708 if (has_under_min_watermark_zone) 2709 count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT); 2710 else 2711 congestion_wait(BLK_RW_ASYNC, HZ/10); 2712 } 2713 2714 /* 2715 * We do this so kswapd doesn't build up large priorities, for 2716 * example when it is freeing in parallel with allocators. It 2717 * matches the direct reclaim path behaviour in terms of impact 2718 * on zone->*_priority. 2719 */ 2720 if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX) 2721 break; 2722 } 2723out: 2724 2725 /* 2726 * order-0: All zones must meet high watermark for a balanced node 2727 * high-order: Balanced zones must make up at least 25% of the node 2728 * for the node to be balanced 2729 */ 2730 if (!(all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))) { 2731 cond_resched(); 2732 2733 try_to_freeze(); 2734 2735 /* 2736 * Fragmentation may mean that the system cannot be 2737 * rebalanced for high-order allocations in all zones.
2738 * At this point, if nr_reclaimed < SWAP_CLUSTER_MAX, 2739 * it means the zones have been fully scanned and are still 2740 * not balanced. For high-order allocations, there is 2741 * little point in trying all over again as kswapd may 2742 * loop forever. 2743 * 2744 * Instead, recheck all watermarks at order-0 as they 2745 * are the most important. If watermarks are ok, kswapd will go 2746 * back to sleep. High-order users can still perform direct 2747 * reclaim if they wish. 2748 */ 2749 if (sc.nr_reclaimed < SWAP_CLUSTER_MAX) 2750 order = sc.order = 0; 2751 2752 goto loop_again; 2753 } 2754 2755 /* 2756 * If kswapd was reclaiming at a higher order, it has the option of 2757 * sleeping without all zones being balanced. Before it does, it must 2758 * ensure that the watermarks for order-0 on *all* zones are met and 2759 * that the congestion flags are cleared. The congestion flag must 2760 * be cleared as kswapd is the only mechanism that clears the flag 2761 * and it is potentially going to sleep here. 2762 */ 2763 if (order) { 2764 for (i = 0; i <= end_zone; i++) { 2765 struct zone *zone = pgdat->node_zones + i; 2766 2767 if (!populated_zone(zone)) 2768 continue; 2769 2770 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 2771 continue; 2772 2773 /* Confirm the zone is balanced for order-0 */ 2774 if (!zone_watermark_ok(zone, 0, 2775 high_wmark_pages(zone), 0, 0)) { 2776 order = sc.order = 0; 2777 goto loop_again; 2778 } 2779 2780 /* If balanced, clear the congested flag */ 2781 zone_clear_flag(zone, ZONE_CONGESTED); 2782 if (i <= *classzone_idx) 2783 balanced += zone->present_pages; 2784 } 2785 } 2786 2787 /* 2788 * Return the order we were reclaiming at so sleeping_prematurely() 2789 * can make a decision on the order we were last reclaiming at. However, 2790 * if another caller entered the allocator slow path while kswapd 2791 * was awake, order will remain at the higher level. 2792 */ 2793 *classzone_idx = end_zone; 2794 return order; 2795} 2796 2797static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) 2798{ 2799 long remaining = 0; 2800 DEFINE_WAIT(wait); 2801 2802 if (freezing(current) || kthread_should_stop()) 2803 return; 2804 2805 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); 2806 2807 /* Try to sleep for a short interval */ 2808 if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) { 2809 remaining = schedule_timeout(HZ/10); 2810 finish_wait(&pgdat->kswapd_wait, &wait); 2811 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); 2812 } 2813 2814 /* 2815 * After a short sleep, check if it was a premature sleep. If not, then 2816 * go fully to sleep until explicitly woken up. 2817 */ 2818 if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) { 2819 trace_mm_vmscan_kswapd_sleep(pgdat->node_id); 2820 2821 /* 2822 * vmstat counters are not perfectly accurate and the estimated 2823 * value for counters such as NR_FREE_PAGES can deviate from the 2824 * true value by nr_online_cpus * threshold. To avoid the zone 2825 * watermarks being breached while under pressure, we reduce the 2826 * per-cpu vmstat threshold while kswapd is awake and restore 2827 * them before going back to sleep.
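 *
 * Editor's note, with invented but representative numbers: given 16
 * online cpus and a per-cpu counter threshold of 32, NR_FREE_PAGES may
 * be stale by up to 16 * 32 = 512 pages (2MB with 4K pages), which is
 * why the tighter "pressure" threshold is used while kswapd is awake.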
2828 */ 2829 set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); 2830 schedule(); 2831 set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold); 2832 } else { 2833 if (remaining) 2834 count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY); 2835 else 2836 count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY); 2837 } 2838 finish_wait(&pgdat->kswapd_wait, &wait); 2839} 2840 2841/* 2842 * The background pageout daemon, started as a kernel thread 2843 * from the init process. 2844 * 2845 * This basically trickles out pages so that we have _some_ 2846 * free memory available even if there is no other activity 2847 * that frees anything up. This is needed for things like routing 2848 * etc, where we otherwise might have all activity going on in 2849 * asynchronous contexts that cannot page things out. 2850 * 2851 * If there are applications that are active memory-allocators 2852 * (most normal use), this basically shouldn't matter. 2853 */ 2854static int kswapd(void *p) 2855{ 2856 unsigned long order, new_order; 2857 unsigned balanced_order; 2858 int classzone_idx, new_classzone_idx; 2859 int balanced_classzone_idx; 2860 pg_data_t *pgdat = (pg_data_t*)p; 2861 struct task_struct *tsk = current; 2862 2863 struct reclaim_state reclaim_state = { 2864 .reclaimed_slab = 0, 2865 }; 2866 const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); 2867 2868 lockdep_set_current_reclaim_state(GFP_KERNEL); 2869 2870 if (!cpumask_empty(cpumask)) 2871 set_cpus_allowed_ptr(tsk, cpumask); 2872 current->reclaim_state = &reclaim_state; 2873 2874 /* 2875 * Tell the memory management that we're a "memory allocator", 2876 * and that if we need more memory we should get access to it 2877 * regardless (see "__alloc_pages()"). "kswapd" should 2878 * never get caught in the normal page freeing logic. 2879 * 2880 * (Kswapd normally doesn't need memory anyway, but sometimes 2881 * you need a small amount of memory in order to be able to 2882 * page out something else, and this flag essentially protects 2883 * us from recursively trying to free more memory as we're 2884 * trying to free the first piece of memory in the first place). 
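 *
 * Editor's note: PF_MEMALLOC is what allows kswapd to dip below the min
 * watermark into the emergency reserves when it must allocate in order
 * to free memory, and PF_KSWAPD is the flag that current_is_kswapd()
 * tests elsewhere in this file.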
2885 */ 2886 tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; 2887 set_freezable(); 2888 2889 order = new_order = 0; 2890 balanced_order = 0; 2891 classzone_idx = new_classzone_idx = pgdat->nr_zones - 1; 2892 balanced_classzone_idx = classzone_idx; 2893 for ( ; ; ) { 2894 int ret; 2895 2896 /* 2897 * If the last balance_pgdat was unsuccessful, it's unlikely a 2898 * new request of a similar or harder type will succeed soon, 2899 * so consider going to sleep on the basis of the order we reclaimed at. 2900 */ 2901 if (balanced_classzone_idx >= new_classzone_idx && 2902 balanced_order == new_order) { 2903 new_order = pgdat->kswapd_max_order; 2904 new_classzone_idx = pgdat->classzone_idx; 2905 pgdat->kswapd_max_order = 0; 2906 pgdat->classzone_idx = pgdat->nr_zones - 1; 2907 } 2908 2909 if (order < new_order || classzone_idx > new_classzone_idx) { 2910 /* 2911 * Don't sleep if someone wants a larger 'order' 2912 * allocation or has tighter zone constraints 2913 */ 2914 order = new_order; 2915 classzone_idx = new_classzone_idx; 2916 } else { 2917 kswapd_try_to_sleep(pgdat, balanced_order, 2918 balanced_classzone_idx); 2919 order = pgdat->kswapd_max_order; 2920 classzone_idx = pgdat->classzone_idx; 2921 new_order = order; 2922 new_classzone_idx = classzone_idx; 2923 pgdat->kswapd_max_order = 0; 2924 pgdat->classzone_idx = pgdat->nr_zones - 1; 2925 } 2926 2927 ret = try_to_freeze(); 2928 if (kthread_should_stop()) 2929 break; 2930 2931 /* 2932 * We can speed up thawing tasks if we don't call balance_pgdat 2933 * after returning from the refrigerator 2934 */ 2935 if (!ret) { 2936 trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); 2937 balanced_classzone_idx = classzone_idx; 2938 balanced_order = balance_pgdat(pgdat, order, 2939 &balanced_classzone_idx); 2940 } 2941 } 2942 return 0; 2943} 2944 2945/* 2946 * A zone is low on free memory, so wake its kswapd task to service it. 2947 */ 2948void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) 2949{ 2950 pg_data_t *pgdat; 2951 2952 if (!populated_zone(zone)) 2953 return; 2954 2955 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 2956 return; 2957 pgdat = zone->zone_pgdat; 2958 if (pgdat->kswapd_max_order < order) { 2959 pgdat->kswapd_max_order = order; 2960 pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx); 2961 } 2962 if (!waitqueue_active(&pgdat->kswapd_wait)) 2963 return; 2964 if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0)) 2965 return; 2966 2967 trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); 2968 wake_up_interruptible(&pgdat->kswapd_wait); 2969} 2970 2971/* 2972 * The reclaimable count would be mostly accurate.
2973 * The less reclaimable pages may be 2974 * - mlocked pages, which will be moved to unevictable list when encountered 2975 * - mapped pages, which may require several passes to be reclaimed 2976 * - dirty pages, which are not "instantly" reclaimable 2977 */ 2978unsigned long global_reclaimable_pages(void) 2979{ 2980 int nr; 2981 2982 nr = global_page_state(NR_ACTIVE_FILE) + 2983 global_page_state(NR_INACTIVE_FILE); 2984 2985 if (nr_swap_pages > 0) 2986 nr += global_page_state(NR_ACTIVE_ANON) + 2987 global_page_state(NR_INACTIVE_ANON); 2988 2989 return nr; 2990} 2991 2992unsigned long zone_reclaimable_pages(struct zone *zone) 2993{ 2994 int nr; 2995 2996 nr = zone_page_state(zone, NR_ACTIVE_FILE) + 2997 zone_page_state(zone, NR_INACTIVE_FILE); 2998 2999 if (nr_swap_pages > 0) 3000 nr += zone_page_state(zone, NR_ACTIVE_ANON) + 3001 zone_page_state(zone, NR_INACTIVE_ANON); 3002 3003 return nr; 3004} 3005 3006#ifdef CONFIG_HIBERNATION 3007/* 3008 * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of 3009 * freed pages. 3010 * 3011 * Rather than trying to age LRUs, the aim is to preserve the overall 3012 * LRU order by reclaiming preferentially 3013 * inactive > active > active referenced > active mapped 3014 */ 3015unsigned long shrink_all_memory(unsigned long nr_to_reclaim) 3016{ 3017 struct reclaim_state reclaim_state; 3018 struct scan_control sc = { 3019 .gfp_mask = GFP_HIGHUSER_MOVABLE, 3020 .may_swap = 1, 3021 .may_unmap = 1, 3022 .may_writepage = 1, 3023 .nr_to_reclaim = nr_to_reclaim, 3024 .hibernation_mode = 1, 3025 .order = 0, 3026 }; 3027 struct shrink_control shrink = { 3028 .gfp_mask = sc.gfp_mask, 3029 }; 3030 struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); 3031 struct task_struct *p = current; 3032 unsigned long nr_reclaimed; 3033 3034 p->flags |= PF_MEMALLOC; 3035 lockdep_set_current_reclaim_state(sc.gfp_mask); 3036 reclaim_state.reclaimed_slab = 0; 3037 p->reclaim_state = &reclaim_state; 3038 3039 nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink); 3040 3041 p->reclaim_state = NULL; 3042 lockdep_clear_current_reclaim_state(); 3043 p->flags &= ~PF_MEMALLOC; 3044 3045 return nr_reclaimed; 3046} 3047#endif /* CONFIG_HIBERNATION */ 3048 3049/* It's optimal to keep kswapds on the same CPUs as their memory, but 3050 not required for correctness. So if the last cpu in a node goes 3051 away, we get changed to run anywhere: as the first one comes back, 3052 restore their cpu bindings. */ 3053static int __devinit cpu_callback(struct notifier_block *nfb, 3054 unsigned long action, void *hcpu) 3055{ 3056 int nid; 3057 3058 if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) { 3059 for_each_node_state(nid, N_HIGH_MEMORY) { 3060 pg_data_t *pgdat = NODE_DATA(nid); 3061 const struct cpumask *mask; 3062 3063 mask = cpumask_of_node(pgdat->node_id); 3064 3065 if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids) 3066 /* One of our CPUs online: restore mask */ 3067 set_cpus_allowed_ptr(pgdat->kswapd, mask); 3068 } 3069 } 3070 return NOTIFY_OK; 3071} 3072 3073/* 3074 * This kswapd start function will be called by init and node-hot-add. 3075 * On node-hot-add, kswapd will be moved to the proper cpus if cpus are hot-added.
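 *
 * Editor's note: a minimal sketch (not part of this file) of how a
 * hypothetical hotplug path might drive the two helpers below; the
 * example_* wrappers are invented for illustration.
 */
#if 0	/* editorial sketch, never compiled */
static void example_node_online(int nid)
{
	/* idempotent: kswapd_run() returns 0 if the thread already runs */
	if (kswapd_run(nid))
		pr_err("example: no kswapd for node %d\n", nid);
}

static void example_node_offline(int nid)
{
	/* safe if no thread was ever started: kswapd_stop() checks */
	kswapd_stop(nid);
}
#endif
/*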
3076 */ 3077int kswapd_run(int nid) 3078{ 3079 pg_data_t *pgdat = NODE_DATA(nid); 3080 int ret = 0; 3081 3082 if (pgdat->kswapd) 3083 return 0; 3084 3085 pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid); 3086 if (IS_ERR(pgdat->kswapd)) { 3087 /* failure at boot is fatal */ 3088 BUG_ON(system_state == SYSTEM_BOOTING); 3089 printk("Failed to start kswapd on node %d\n",nid); 3090 ret = -1; 3091 } 3092 return ret; 3093} 3094 3095/* 3096 * Called by memory hotplug when all memory in a node is offlined. 3097 */ 3098void kswapd_stop(int nid) 3099{ 3100 struct task_struct *kswapd = NODE_DATA(nid)->kswapd; 3101 3102 if (kswapd) 3103 kthread_stop(kswapd); 3104} 3105 3106static int __init kswapd_init(void) 3107{ 3108 int nid; 3109 3110 swap_setup(); 3111 for_each_node_state(nid, N_HIGH_MEMORY) 3112 kswapd_run(nid); 3113 hotcpu_notifier(cpu_callback, 0); 3114 return 0; 3115} 3116 3117module_init(kswapd_init) 3118 3119#ifdef CONFIG_NUMA 3120/* 3121 * Zone reclaim mode 3122 * 3123 * If non-zero call zone_reclaim when the number of free pages falls below 3124 * the watermarks. 3125 */ 3126int zone_reclaim_mode __read_mostly; 3127 3128#define RECLAIM_OFF 0 3129#define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */ 3130#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ 3131#define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */ 3132 3133/* 3134 * Priority for ZONE_RECLAIM. This determines the fraction of pages 3135 * of a node considered for each zone_reclaim. 4 scans 1/16th of 3136 * a zone. 3137 */ 3138#define ZONE_RECLAIM_PRIORITY 4 3139 3140/* 3141 * Percentage of pages in a zone that must be unmapped for zone_reclaim to 3142 * occur. 3143 */ 3144int sysctl_min_unmapped_ratio = 1; 3145 3146/* 3147 * If the number of slab pages in a zone grows beyond this percentage then 3148 * slab reclaim needs to occur. 3149 */ 3150int sysctl_min_slab_ratio = 5; 3151 3152static inline unsigned long zone_unmapped_file_pages(struct zone *zone) 3153{ 3154 unsigned long file_mapped = zone_page_state(zone, NR_FILE_MAPPED); 3155 unsigned long file_lru = zone_page_state(zone, NR_INACTIVE_FILE) + 3156 zone_page_state(zone, NR_ACTIVE_FILE); 3157 3158 /* 3159 * It's possible for there to be more file mapped pages than 3160 * accounted for by the pages on the file LRU lists because 3161 * tmpfs pages accounted for as ANON can also be FILE_MAPPED 3162 */ 3163 return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0; 3164} 3165 3166/* Work out how many page cache pages we can reclaim in this reclaim_mode */ 3167static long zone_pagecache_reclaimable(struct zone *zone) 3168{ 3169 long nr_pagecache_reclaimable; 3170 long delta = 0; 3171 3172 /* 3173 * If RECLAIM_SWAP is set, then all file pages are considered 3174 * potentially reclaimable. 
Otherwise, we have to worry about 3175 * pages like swapcache, and zone_unmapped_file_pages() provides 3176 * a better estimate. 3177 */ 3178 if (zone_reclaim_mode & RECLAIM_SWAP) 3179 nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES); 3180 else 3181 nr_pagecache_reclaimable = zone_unmapped_file_pages(zone); 3182 3183 /* If we can't clean pages, remove dirty pages from consideration */ 3184 if (!(zone_reclaim_mode & RECLAIM_WRITE)) 3185 delta += zone_page_state(zone, NR_FILE_DIRTY); 3186 3187 /* Watch for any possible underflows due to delta */ 3188 if (unlikely(delta > nr_pagecache_reclaimable)) 3189 delta = nr_pagecache_reclaimable; 3190 3191 return nr_pagecache_reclaimable - delta; 3192} 3193 3194/* 3195 * Try to free up some pages from this zone through reclaim. 3196 */ 3197static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) 3198{ 3199 /* Minimum pages needed in order to stay on node */ 3200 const unsigned long nr_pages = 1 << order; 3201 struct task_struct *p = current; 3202 struct reclaim_state reclaim_state; 3203 int priority; 3204 struct scan_control sc = { 3205 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), 3206 .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), 3207 .may_swap = 1, 3208 .nr_to_reclaim = max_t(unsigned long, nr_pages, 3209 SWAP_CLUSTER_MAX), 3210 .gfp_mask = gfp_mask, 3211 .order = order, 3212 }; 3213 struct shrink_control shrink = { 3214 .gfp_mask = sc.gfp_mask, 3215 }; 3216 unsigned long nr_slab_pages0, nr_slab_pages1; 3217 3218 cond_resched(); 3219 /* 3220 * We need to be able to allocate from the reserves for RECLAIM_SWAP 3221 * and we also need to be able to write out pages for RECLAIM_WRITE 3222 * and RECLAIM_SWAP. 3223 */ 3224 p->flags |= PF_MEMALLOC | PF_SWAPWRITE; 3225 lockdep_set_current_reclaim_state(gfp_mask); 3226 reclaim_state.reclaimed_slab = 0; 3227 p->reclaim_state = &reclaim_state; 3228 3229 if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) { 3230 /* 3231 * Free memory by calling shrink_zone() with increasing 3232 * priorities until we have enough memory freed. 3233 */ 3234 priority = ZONE_RECLAIM_PRIORITY; 3235 do { 3236 shrink_zone(priority, zone, &sc); 3237 priority--; 3238 } while (priority >= 0 && sc.nr_reclaimed < nr_pages); 3239 } 3240 3241 nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE); 3242 if (nr_slab_pages0 > zone->min_slab_pages) { 3243 /* 3244 * shrink_slab() does not currently allow us to determine how 3245 * many pages were freed in this zone. So we take the current 3246 * number of slab pages and shake the slab until it is reduced 3247 * by the same nr_pages that we used for reclaiming unmapped 3248 * pages. 3249 * 3250 * Note that shrink_slab() will free memory on all zones and may 3251 * take a long time. 3252 */ 3253 for (;;) { 3254 unsigned long lru_pages = zone_reclaimable_pages(zone); 3255 3256 /* No reclaimable slab or very low memory pressure */ 3257 if (!shrink_slab(&shrink, sc.nr_scanned, lru_pages)) 3258 break; 3259 3260 /* Freed enough memory */ 3261 nr_slab_pages1 = zone_page_state(zone, 3262 NR_SLAB_RECLAIMABLE); 3263 if (nr_slab_pages1 + nr_pages <= nr_slab_pages0) 3264 break; 3265 } 3266 3267 /* 3268 * Update nr_reclaimed by the number of slab pages we 3269 * reclaimed from this zone.
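 *
 * Editor's note: because shrink_slab() is not zone-aware, the
 * before/after NR_SLAB_RECLAIMABLE delta below is only a local
 * approximation; slab memory freed on other zones' behalf is not
 * credited to sc.nr_reclaimed.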
3270 */ 3271 nr_slab_pages1 = zone_page_state(zone, NR_SLAB_RECLAIMABLE); 3272 if (nr_slab_pages1 < nr_slab_pages0) 3273 sc.nr_reclaimed += nr_slab_pages0 - nr_slab_pages1; 3274 } 3275 3276 p->reclaim_state = NULL; 3277 current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); 3278 lockdep_clear_current_reclaim_state(); 3279 return sc.nr_reclaimed >= nr_pages; 3280} 3281 3282int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) 3283{ 3284 int node_id; 3285 int ret; 3286 3287 /* 3288 * Zone reclaim reclaims unmapped file backed pages and 3289 * slab pages if we are over the defined limits. 3290 * 3291 * A small portion of unmapped file backed pages is needed for 3292 * file I/O otherwise pages read by file I/O will be immediately 3293 * thrown out if the zone is overallocated. So we do not reclaim 3294 * if less than a specified percentage of the zone is used by 3295 * unmapped file backed pages. 3296 */ 3297 if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages && 3298 zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages) 3299 return ZONE_RECLAIM_FULL; 3300 3301 if (zone->all_unreclaimable) 3302 return ZONE_RECLAIM_FULL; 3303 3304 /* 3305 * Do not scan if the allocation should not be delayed. 3306 */ 3307 if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC)) 3308 return ZONE_RECLAIM_NOSCAN; 3309 3310 /* 3311 * Only run zone reclaim on the local zone or on zones that do not 3312 * have associated processors. This will favor the local processor 3313 * over remote processors and spread off node memory allocations 3314 * as wide as possible. 3315 */ 3316 node_id = zone_to_nid(zone); 3317 if (node_state(node_id, N_CPU) && node_id != numa_node_id()) 3318 return ZONE_RECLAIM_NOSCAN; 3319 3320 if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED)) 3321 return ZONE_RECLAIM_NOSCAN; 3322 3323 ret = __zone_reclaim(zone, gfp_mask, order); 3324 zone_clear_flag(zone, ZONE_RECLAIM_LOCKED); 3325 3326 if (!ret) 3327 count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED); 3328 3329 return ret; 3330} 3331#endif 3332 3333/* 3334 * page_evictable - test whether a page is evictable 3335 * @page: the page to test 3336 * @vma: the VMA in which the page is or will be mapped, may be NULL 3337 * 3338 * Test whether page is evictable--i.e., should be placed on active/inactive 3339 * lists vs unevictable list. The vma argument is !NULL when called from the 3340 * fault path to determine how to instantiate a new page. 3341 * 3342 * Reasons page might not be evictable: 3343 * (1) page's mapping marked unevictable 3344 * (2) page is part of an mlocked VMA 3345 * 3346 */ 3347int page_evictable(struct page *page, struct vm_area_struct *vma) 3348{ 3349 3350 if (mapping_unevictable(page_mapping(page))) 3351 return 0; 3352 3353 if (PageMlocked(page) || (vma && is_mlocked_vma(vma, page))) 3354 return 0; 3355 3356 return 1; 3357} 3358 3359/** 3360 * check_move_unevictable_page - check page for evictability and move to appropriate zone lru list 3361 * @page: page to check evictability and move to appropriate lru list 3362 * @zone: zone page is in 3363 * 3364 * Checks a page for evictability and moves the page to the appropriate 3365 * zone lru list. 3366 * 3367 * Restrictions: zone->lru_lock must be held, page must be on LRU and must 3368 * have PageUnevictable set.
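 *
 * Editor's note: a minimal sketch (not part of this file) of the locking
 * convention, mirroring scan_mapping_unevictable_pages() below; the
 * example_ name is invented for illustration.
 */
#if 0	/* editorial sketch, never compiled */
static void example_rescue_one_page(struct page *page)
{
	struct zone *zone = page_zone(page);

	spin_lock_irq(&zone->lru_lock);
	if (PageLRU(page) && PageUnevictable(page))
		check_move_unevictable_page(page, zone);
	spin_unlock_irq(&zone->lru_lock);
}
#endif
/*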
3369 */ 3370static void check_move_unevictable_page(struct page *page, struct zone *zone) 3371{ 3372 VM_BUG_ON(PageActive(page)); 3373 3374retry: 3375 ClearPageUnevictable(page); 3376 if (page_evictable(page, NULL)) { 3377 enum lru_list l = page_lru_base_type(page); 3378 3379 __dec_zone_state(zone, NR_UNEVICTABLE); 3380 list_move(&page->lru, &zone->lru[l].list); 3381 mem_cgroup_move_lists(page, LRU_UNEVICTABLE, l); 3382 __inc_zone_state(zone, NR_INACTIVE_ANON + l); 3383 __count_vm_event(UNEVICTABLE_PGRESCUED); 3384 } else { 3385 /* 3386 * rotate unevictable list 3387 */ 3388 SetPageUnevictable(page); 3389 list_move(&page->lru, &zone->lru[LRU_UNEVICTABLE].list); 3390 mem_cgroup_rotate_lru_list(page, LRU_UNEVICTABLE); 3391 if (page_evictable(page, NULL)) 3392 goto retry; 3393 } 3394} 3395 3396/** 3397 * scan_mapping_unevictable_pages - scan an address space for evictable pages 3398 * @mapping: struct address_space to scan for evictable pages 3399 * 3400 * Scan all pages in mapping. Check unevictable pages for 3401 * evictability and move them to the appropriate zone lru list. 3402 */ 3403void scan_mapping_unevictable_pages(struct address_space *mapping) 3404{ 3405 pgoff_t next = 0; 3406 pgoff_t end = (i_size_read(mapping->host) + PAGE_CACHE_SIZE - 1) >> 3407 PAGE_CACHE_SHIFT; 3408 struct zone *zone; 3409 struct pagevec pvec; 3410 3411 if (mapping->nrpages == 0) 3412 return; 3413 3414 pagevec_init(&pvec, 0); 3415 while (next < end && 3416 pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { 3417 int i; 3418 int pg_scanned = 0; 3419 3420 zone = NULL; 3421 3422 for (i = 0; i < pagevec_count(&pvec); i++) { 3423 struct page *page = pvec.pages[i]; 3424 pgoff_t page_index = page->index; 3425 struct zone *pagezone = page_zone(page); 3426 3427 pg_scanned++; 3428 if (page_index > next) 3429 next = page_index; 3430 next++; 3431 3432 if (pagezone != zone) { 3433 if (zone) 3434 spin_unlock_irq(&zone->lru_lock); 3435 zone = pagezone; 3436 spin_lock_irq(&zone->lru_lock); 3437 } 3438 3439 if (PageLRU(page) && PageUnevictable(page)) 3440 check_move_unevictable_page(page, zone); 3441 } 3442 if (zone) 3443 spin_unlock_irq(&zone->lru_lock); 3444 pagevec_release(&pvec); 3445 3446 count_vm_events(UNEVICTABLE_PGSCANNED, pg_scanned); 3447 } 3448 3449} 3450 3451static void warn_scan_unevictable_pages(void) 3452{ 3453 printk_once(KERN_WARNING 3454 "The scan_unevictable_pages sysctl/node-interface has been " 3455 "disabled for lack of a legitimate use case. If you have " 3456 "one, please send an email to linux-mm@kvack.org.\n"); 3457} 3458 3459/* 3460 * scan_unevictable_pages [vm] sysctl handler. On demand re-scan of 3461 * all nodes' unevictable lists for evictable pages 3462 */ 3463unsigned long scan_unevictable_pages; 3464 3465int scan_unevictable_handler(struct ctl_table *table, int write, 3466 void __user *buffer, 3467 size_t *length, loff_t *ppos) 3468{ 3469 warn_scan_unevictable_pages(); 3470 proc_doulongvec_minmax(table, write, buffer, length, ppos); 3471 scan_unevictable_pages = 0; 3472 return 0; 3473} 3474 3475#ifdef CONFIG_NUMA 3476/* 3477 * per node 'scan_unevictable_pages' attribute. On demand re-scan of 3478 * a specified node's per zone unevictable lists for evictable pages. 3479 */ 3480 3481static ssize_t read_scan_unevictable_node(struct sys_device *dev, 3482 struct sysdev_attribute *attr, 3483 char *buf) 3484{ 3485 warn_scan_unevictable_pages(); 3486 return sprintf(buf, "0\n"); /* always zero; should fit... 
*/ 3487} 3488 3489static ssize_t write_scan_unevictable_node(struct sys_device *dev, 3490 struct sysdev_attribute *attr, 3491 const char *buf, size_t count) 3492{ 3493 warn_scan_unevictable_pages(); 3494 return 1; 3495} 3496 3497 3498static SYSDEV_ATTR(scan_unevictable_pages, S_IRUGO | S_IWUSR, 3499 read_scan_unevictable_node, 3500 write_scan_unevictable_node); 3501 3502int scan_unevictable_register_node(struct node *node) 3503{ 3504 return sysdev_create_file(&node->sysdev, &attr_scan_unevictable_pages); 3505} 3506 3507void scan_unevictable_unregister_node(struct node *node) 3508{ 3509 sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages); 3510} 3511#endif 3512