vmscan.c revision a18bba061c789f5815c3efc3c80e6ac269911964
1/* 2 * linux/mm/vmscan.c 3 * 4 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 5 * 6 * Swap reorganised 29.12.95, Stephen Tweedie. 7 * kswapd added: 7.1.96 sct 8 * Removed kswapd_ctl limits, and swap out as many pages as needed 9 * to bring the system back to freepages.high: 2.4.97, Rik van Riel. 10 * Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com). 11 * Multiqueue VM started 5.8.00, Rik van Riel. 12 */ 13 14#include <linux/mm.h> 15#include <linux/module.h> 16#include <linux/gfp.h> 17#include <linux/kernel_stat.h> 18#include <linux/swap.h> 19#include <linux/pagemap.h> 20#include <linux/init.h> 21#include <linux/highmem.h> 22#include <linux/vmstat.h> 23#include <linux/file.h> 24#include <linux/writeback.h> 25#include <linux/blkdev.h> 26#include <linux/buffer_head.h> /* for try_to_release_page(), 27 buffer_heads_over_limit */ 28#include <linux/mm_inline.h> 29#include <linux/pagevec.h> 30#include <linux/backing-dev.h> 31#include <linux/rmap.h> 32#include <linux/topology.h> 33#include <linux/cpu.h> 34#include <linux/cpuset.h> 35#include <linux/compaction.h> 36#include <linux/notifier.h> 37#include <linux/rwsem.h> 38#include <linux/delay.h> 39#include <linux/kthread.h> 40#include <linux/freezer.h> 41#include <linux/memcontrol.h> 42#include <linux/delayacct.h> 43#include <linux/sysctl.h> 44#include <linux/oom.h> 45#include <linux/prefetch.h> 46 47#include <asm/tlbflush.h> 48#include <asm/div64.h> 49 50#include <linux/swapops.h> 51 52#include "internal.h" 53 54#define CREATE_TRACE_POINTS 55#include <trace/events/vmscan.h> 56 57/* 58 * reclaim_mode determines how the inactive list is shrunk 59 * RECLAIM_MODE_SINGLE: Reclaim only order-0 pages 60 * RECLAIM_MODE_ASYNC: Do not block 61 * RECLAIM_MODE_SYNC: Allow blocking e.g. call wait_on_page_writeback 62 * RECLAIM_MODE_LUMPYRECLAIM: For high-order allocations, take a reference 63 * page from the LRU and reclaim all pages within a 64 * naturally aligned range 65 * RECLAIM_MODE_COMPACTION: For high-order allocations, reclaim a number of 66 * order-0 pages and then compact the zone 67 */ 68typedef unsigned __bitwise__ reclaim_mode_t; 69#define RECLAIM_MODE_SINGLE ((__force reclaim_mode_t)0x01u) 70#define RECLAIM_MODE_ASYNC ((__force reclaim_mode_t)0x02u) 71#define RECLAIM_MODE_SYNC ((__force reclaim_mode_t)0x04u) 72#define RECLAIM_MODE_LUMPYRECLAIM ((__force reclaim_mode_t)0x08u) 73#define RECLAIM_MODE_COMPACTION ((__force reclaim_mode_t)0x10u) 74 75struct scan_control { 76 /* Incremented by the number of inactive pages that were scanned */ 77 unsigned long nr_scanned; 78 79 /* Number of pages freed so far during a call to shrink_zones() */ 80 unsigned long nr_reclaimed; 81 82 /* How many pages shrink_list() should reclaim */ 83 unsigned long nr_to_reclaim; 84 85 unsigned long hibernation_mode; 86 87 /* This context's GFP mask */ 88 gfp_t gfp_mask; 89 90 int may_writepage; 91 92 /* Can mapped pages be reclaimed? */ 93 int may_unmap; 94 95 /* Can pages be swapped as part of reclaim? */ 96 int may_swap; 97 98 int order; 99 100 /* 101 * Intend to reclaim enough continuous memory rather than reclaim 102 * enough amount of memory. i.e, mode for high order allocation. 103 */ 104 reclaim_mode_t reclaim_mode; 105 106 /* Which cgroup do we reclaim from */ 107 struct mem_cgroup *mem_cgroup; 108 109 /* 110 * Nodemask of nodes allowed by the caller. If NULL, all nodes 111 * are scanned. 112 */ 113 nodemask_t *nodemask; 114}; 115 116#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) 117 118#ifdef ARCH_HAS_PREFETCH 119#define prefetch_prev_lru_page(_page, _base, _field) \ 120 do { \ 121 if ((_page)->lru.prev != _base) { \ 122 struct page *prev; \ 123 \ 124 prev = lru_to_page(&(_page->lru)); \ 125 prefetch(&prev->_field); \ 126 } \ 127 } while (0) 128#else 129#define prefetch_prev_lru_page(_page, _base, _field) do { } while (0) 130#endif 131 132#ifdef ARCH_HAS_PREFETCHW 133#define prefetchw_prev_lru_page(_page, _base, _field) \ 134 do { \ 135 if ((_page)->lru.prev != _base) { \ 136 struct page *prev; \ 137 \ 138 prev = lru_to_page(&(_page->lru)); \ 139 prefetchw(&prev->_field); \ 140 } \ 141 } while (0) 142#else 143#define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0) 144#endif 145 146/* 147 * From 0 .. 100. Higher means more swappy. 148 */ 149int vm_swappiness = 60; 150long vm_total_pages; /* The total number of pages which the VM controls */ 151 152static LIST_HEAD(shrinker_list); 153static DECLARE_RWSEM(shrinker_rwsem); 154 155#ifdef CONFIG_CGROUP_MEM_RES_CTLR 156#define scanning_global_lru(sc) (!(sc)->mem_cgroup) 157#else 158#define scanning_global_lru(sc) (1) 159#endif 160 161static struct zone_reclaim_stat *get_reclaim_stat(struct zone *zone, 162 struct scan_control *sc) 163{ 164 if (!scanning_global_lru(sc)) 165 return mem_cgroup_get_reclaim_stat(sc->mem_cgroup, zone); 166 167 return &zone->reclaim_stat; 168} 169 170static unsigned long zone_nr_lru_pages(struct zone *zone, 171 struct scan_control *sc, enum lru_list lru) 172{ 173 if (!scanning_global_lru(sc)) 174 return mem_cgroup_zone_nr_lru_pages(sc->mem_cgroup, 175 zone_to_nid(zone), zone_idx(zone), BIT(lru)); 176 177 return zone_page_state(zone, NR_LRU_BASE + lru); 178} 179 180 181/* 182 * Add a shrinker callback to be called from the vm 183 */ 184void register_shrinker(struct shrinker *shrinker) 185{ 186 shrinker->nr = 0; 187 down_write(&shrinker_rwsem); 188 list_add_tail(&shrinker->list, &shrinker_list); 189 up_write(&shrinker_rwsem); 190} 191EXPORT_SYMBOL(register_shrinker); 192 193/* 194 * Remove one 195 */ 196void unregister_shrinker(struct shrinker *shrinker) 197{ 198 down_write(&shrinker_rwsem); 199 list_del(&shrinker->list); 200 up_write(&shrinker_rwsem); 201} 202EXPORT_SYMBOL(unregister_shrinker); 203 204static inline int do_shrinker_shrink(struct shrinker *shrinker, 205 struct shrink_control *sc, 206 unsigned long nr_to_scan) 207{ 208 sc->nr_to_scan = nr_to_scan; 209 return (*shrinker->shrink)(shrinker, sc); 210} 211 212#define SHRINK_BATCH 128 213/* 214 * Call the shrink functions to age shrinkable caches 215 * 216 * Here we assume it costs one seek to replace a lru page and that it also 217 * takes a seek to recreate a cache object. With this in mind we age equal 218 * percentages of the lru and ageable caches. This should balance the seeks 219 * generated by these structures. 220 * 221 * If the vm encountered mapped pages on the LRU it increase the pressure on 222 * slab to avoid swapping. 223 * 224 * We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits. 225 * 226 * `lru_pages' represents the number of on-LRU pages in all the zones which 227 * are eligible for the caller's allocation attempt. It is used for balancing 228 * slab reclaim versus page reclaim. 229 * 230 * Returns the number of slab objects which we shrunk. 231 */ 232unsigned long shrink_slab(struct shrink_control *shrink, 233 unsigned long nr_pages_scanned, 234 unsigned long lru_pages) 235{ 236 struct shrinker *shrinker; 237 unsigned long ret = 0; 238 239 if (nr_pages_scanned == 0) 240 nr_pages_scanned = SWAP_CLUSTER_MAX; 241 242 if (!down_read_trylock(&shrinker_rwsem)) { 243 /* Assume we'll be able to shrink next time */ 244 ret = 1; 245 goto out; 246 } 247 248 list_for_each_entry(shrinker, &shrinker_list, list) { 249 unsigned long long delta; 250 unsigned long total_scan; 251 unsigned long max_pass; 252 int shrink_ret = 0; 253 long nr; 254 long new_nr; 255 long batch_size = shrinker->batch ? shrinker->batch 256 : SHRINK_BATCH; 257 258 /* 259 * copy the current shrinker scan count into a local variable 260 * and zero it so that other concurrent shrinker invocations 261 * don't also do this scanning work. 262 */ 263 do { 264 nr = shrinker->nr; 265 } while (cmpxchg(&shrinker->nr, nr, 0) != nr); 266 267 total_scan = nr; 268 max_pass = do_shrinker_shrink(shrinker, shrink, 0); 269 delta = (4 * nr_pages_scanned) / shrinker->seeks; 270 delta *= max_pass; 271 do_div(delta, lru_pages + 1); 272 total_scan += delta; 273 if (total_scan < 0) { 274 printk(KERN_ERR "shrink_slab: %pF negative objects to " 275 "delete nr=%ld\n", 276 shrinker->shrink, total_scan); 277 total_scan = max_pass; 278 } 279 280 /* 281 * We need to avoid excessive windup on filesystem shrinkers 282 * due to large numbers of GFP_NOFS allocations causing the 283 * shrinkers to return -1 all the time. This results in a large 284 * nr being built up so when a shrink that can do some work 285 * comes along it empties the entire cache due to nr >>> 286 * max_pass. This is bad for sustaining a working set in 287 * memory. 288 * 289 * Hence only allow the shrinker to scan the entire cache when 290 * a large delta change is calculated directly. 291 */ 292 if (delta < max_pass / 4) 293 total_scan = min(total_scan, max_pass / 2); 294 295 /* 296 * Avoid risking looping forever due to too large nr value: 297 * never try to free more than twice the estimate number of 298 * freeable entries. 299 */ 300 if (total_scan > max_pass * 2) 301 total_scan = max_pass * 2; 302 303 trace_mm_shrink_slab_start(shrinker, shrink, nr, 304 nr_pages_scanned, lru_pages, 305 max_pass, delta, total_scan); 306 307 while (total_scan >= batch_size) { 308 int nr_before; 309 310 nr_before = do_shrinker_shrink(shrinker, shrink, 0); 311 shrink_ret = do_shrinker_shrink(shrinker, shrink, 312 batch_size); 313 if (shrink_ret == -1) 314 break; 315 if (shrink_ret < nr_before) 316 ret += nr_before - shrink_ret; 317 count_vm_events(SLABS_SCANNED, batch_size); 318 total_scan -= batch_size; 319 320 cond_resched(); 321 } 322 323 /* 324 * move the unused scan count back into the shrinker in a 325 * manner that handles concurrent updates. If we exhausted the 326 * scan, there is no need to do an update. 327 */ 328 do { 329 nr = shrinker->nr; 330 new_nr = total_scan + nr; 331 if (total_scan <= 0) 332 break; 333 } while (cmpxchg(&shrinker->nr, nr, new_nr) != nr); 334 335 trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr); 336 } 337 up_read(&shrinker_rwsem); 338out: 339 cond_resched(); 340 return ret; 341} 342 343static void set_reclaim_mode(int priority, struct scan_control *sc, 344 bool sync) 345{ 346 reclaim_mode_t syncmode = sync ? RECLAIM_MODE_SYNC : RECLAIM_MODE_ASYNC; 347 348 /* 349 * Initially assume we are entering either lumpy reclaim or 350 * reclaim/compaction.Depending on the order, we will either set the 351 * sync mode or just reclaim order-0 pages later. 352 */ 353 if (COMPACTION_BUILD) 354 sc->reclaim_mode = RECLAIM_MODE_COMPACTION; 355 else 356 sc->reclaim_mode = RECLAIM_MODE_LUMPYRECLAIM; 357 358 /* 359 * Avoid using lumpy reclaim or reclaim/compaction if possible by 360 * restricting when its set to either costly allocations or when 361 * under memory pressure 362 */ 363 if (sc->order > PAGE_ALLOC_COSTLY_ORDER) 364 sc->reclaim_mode |= syncmode; 365 else if (sc->order && priority < DEF_PRIORITY - 2) 366 sc->reclaim_mode |= syncmode; 367 else 368 sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC; 369} 370 371static void reset_reclaim_mode(struct scan_control *sc) 372{ 373 sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC; 374} 375 376static inline int is_page_cache_freeable(struct page *page) 377{ 378 /* 379 * A freeable page cache page is referenced only by the caller 380 * that isolated the page, the page cache radix tree and 381 * optional buffer heads at page->private. 382 */ 383 return page_count(page) - page_has_private(page) == 2; 384} 385 386static int may_write_to_queue(struct backing_dev_info *bdi, 387 struct scan_control *sc) 388{ 389 if (current->flags & PF_SWAPWRITE) 390 return 1; 391 if (!bdi_write_congested(bdi)) 392 return 1; 393 if (bdi == current->backing_dev_info) 394 return 1; 395 396 /* lumpy reclaim for hugepage often need a lot of write */ 397 if (sc->order > PAGE_ALLOC_COSTLY_ORDER) 398 return 1; 399 return 0; 400} 401 402/* 403 * We detected a synchronous write error writing a page out. Probably 404 * -ENOSPC. We need to propagate that into the address_space for a subsequent 405 * fsync(), msync() or close(). 406 * 407 * The tricky part is that after writepage we cannot touch the mapping: nothing 408 * prevents it from being freed up. But we have a ref on the page and once 409 * that page is locked, the mapping is pinned. 410 * 411 * We're allowed to run sleeping lock_page() here because we know the caller has 412 * __GFP_FS. 413 */ 414static void handle_write_error(struct address_space *mapping, 415 struct page *page, int error) 416{ 417 lock_page(page); 418 if (page_mapping(page) == mapping) 419 mapping_set_error(mapping, error); 420 unlock_page(page); 421} 422 423/* possible outcome of pageout() */ 424typedef enum { 425 /* failed to write page out, page is locked */ 426 PAGE_KEEP, 427 /* move page to the active list, page is locked */ 428 PAGE_ACTIVATE, 429 /* page has been sent to the disk successfully, page is unlocked */ 430 PAGE_SUCCESS, 431 /* page is clean and locked */ 432 PAGE_CLEAN, 433} pageout_t; 434 435/* 436 * pageout is called by shrink_page_list() for each dirty page. 437 * Calls ->writepage(). 438 */ 439static pageout_t pageout(struct page *page, struct address_space *mapping, 440 struct scan_control *sc) 441{ 442 /* 443 * If the page is dirty, only perform writeback if that write 444 * will be non-blocking. To prevent this allocation from being 445 * stalled by pagecache activity. But note that there may be 446 * stalls if we need to run get_block(). We could test 447 * PagePrivate for that. 448 * 449 * If this process is currently in __generic_file_aio_write() against 450 * this page's queue, we can perform writeback even if that 451 * will block. 452 * 453 * If the page is swapcache, write it back even if that would 454 * block, for some throttling. This happens by accident, because 455 * swap_backing_dev_info is bust: it doesn't reflect the 456 * congestion state of the swapdevs. Easy to fix, if needed. 457 */ 458 if (!is_page_cache_freeable(page)) 459 return PAGE_KEEP; 460 if (!mapping) { 461 /* 462 * Some data journaling orphaned pages can have 463 * page->mapping == NULL while being dirty with clean buffers. 464 */ 465 if (page_has_private(page)) { 466 if (try_to_free_buffers(page)) { 467 ClearPageDirty(page); 468 printk("%s: orphaned page\n", __func__); 469 return PAGE_CLEAN; 470 } 471 } 472 return PAGE_KEEP; 473 } 474 if (mapping->a_ops->writepage == NULL) 475 return PAGE_ACTIVATE; 476 if (!may_write_to_queue(mapping->backing_dev_info, sc)) 477 return PAGE_KEEP; 478 479 if (clear_page_dirty_for_io(page)) { 480 int res; 481 struct writeback_control wbc = { 482 .sync_mode = WB_SYNC_NONE, 483 .nr_to_write = SWAP_CLUSTER_MAX, 484 .range_start = 0, 485 .range_end = LLONG_MAX, 486 .for_reclaim = 1, 487 }; 488 489 SetPageReclaim(page); 490 res = mapping->a_ops->writepage(page, &wbc); 491 if (res < 0) 492 handle_write_error(mapping, page, res); 493 if (res == AOP_WRITEPAGE_ACTIVATE) { 494 ClearPageReclaim(page); 495 return PAGE_ACTIVATE; 496 } 497 498 if (!PageWriteback(page)) { 499 /* synchronous write or broken a_ops? */ 500 ClearPageReclaim(page); 501 } 502 trace_mm_vmscan_writepage(page, 503 trace_reclaim_flags(page, sc->reclaim_mode)); 504 inc_zone_page_state(page, NR_VMSCAN_WRITE); 505 return PAGE_SUCCESS; 506 } 507 508 return PAGE_CLEAN; 509} 510 511/* 512 * Same as remove_mapping, but if the page is removed from the mapping, it 513 * gets returned with a refcount of 0. 514 */ 515static int __remove_mapping(struct address_space *mapping, struct page *page) 516{ 517 BUG_ON(!PageLocked(page)); 518 BUG_ON(mapping != page_mapping(page)); 519 520 spin_lock_irq(&mapping->tree_lock); 521 /* 522 * The non racy check for a busy page. 523 * 524 * Must be careful with the order of the tests. When someone has 525 * a ref to the page, it may be possible that they dirty it then 526 * drop the reference. So if PageDirty is tested before page_count 527 * here, then the following race may occur: 528 * 529 * get_user_pages(&page); 530 * [user mapping goes away] 531 * write_to(page); 532 * !PageDirty(page) [good] 533 * SetPageDirty(page); 534 * put_page(page); 535 * !page_count(page) [good, discard it] 536 * 537 * [oops, our write_to data is lost] 538 * 539 * Reversing the order of the tests ensures such a situation cannot 540 * escape unnoticed. The smp_rmb is needed to ensure the page->flags 541 * load is not satisfied before that of page->_count. 542 * 543 * Note that if SetPageDirty is always performed via set_page_dirty, 544 * and thus under tree_lock, then this ordering is not required. 545 */ 546 if (!page_freeze_refs(page, 2)) 547 goto cannot_free; 548 /* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */ 549 if (unlikely(PageDirty(page))) { 550 page_unfreeze_refs(page, 2); 551 goto cannot_free; 552 } 553 554 if (PageSwapCache(page)) { 555 swp_entry_t swap = { .val = page_private(page) }; 556 __delete_from_swap_cache(page); 557 spin_unlock_irq(&mapping->tree_lock); 558 swapcache_free(swap, page); 559 } else { 560 void (*freepage)(struct page *); 561 562 freepage = mapping->a_ops->freepage; 563 564 __delete_from_page_cache(page); 565 spin_unlock_irq(&mapping->tree_lock); 566 mem_cgroup_uncharge_cache_page(page); 567 568 if (freepage != NULL) 569 freepage(page); 570 } 571 572 return 1; 573 574cannot_free: 575 spin_unlock_irq(&mapping->tree_lock); 576 return 0; 577} 578 579/* 580 * Attempt to detach a locked page from its ->mapping. If it is dirty or if 581 * someone else has a ref on the page, abort and return 0. If it was 582 * successfully detached, return 1. Assumes the caller has a single ref on 583 * this page. 584 */ 585int remove_mapping(struct address_space *mapping, struct page *page) 586{ 587 if (__remove_mapping(mapping, page)) { 588 /* 589 * Unfreezing the refcount with 1 rather than 2 effectively 590 * drops the pagecache ref for us without requiring another 591 * atomic operation. 592 */ 593 page_unfreeze_refs(page, 1); 594 return 1; 595 } 596 return 0; 597} 598 599/** 600 * putback_lru_page - put previously isolated page onto appropriate LRU list 601 * @page: page to be put back to appropriate lru list 602 * 603 * Add previously isolated @page to appropriate LRU list. 604 * Page may still be unevictable for other reasons. 605 * 606 * lru_lock must not be held, interrupts must be enabled. 607 */ 608void putback_lru_page(struct page *page) 609{ 610 int lru; 611 int active = !!TestClearPageActive(page); 612 int was_unevictable = PageUnevictable(page); 613 614 VM_BUG_ON(PageLRU(page)); 615 616redo: 617 ClearPageUnevictable(page); 618 619 if (page_evictable(page, NULL)) { 620 /* 621 * For evictable pages, we can use the cache. 622 * In event of a race, worst case is we end up with an 623 * unevictable page on [in]active list. 624 * We know how to handle that. 625 */ 626 lru = active + page_lru_base_type(page); 627 lru_cache_add_lru(page, lru); 628 } else { 629 /* 630 * Put unevictable pages directly on zone's unevictable 631 * list. 632 */ 633 lru = LRU_UNEVICTABLE; 634 add_page_to_unevictable_list(page); 635 /* 636 * When racing with an mlock clearing (page is 637 * unlocked), make sure that if the other thread does 638 * not observe our setting of PG_lru and fails 639 * isolation, we see PG_mlocked cleared below and move 640 * the page back to the evictable list. 641 * 642 * The other side is TestClearPageMlocked(). 643 */ 644 smp_mb(); 645 } 646 647 /* 648 * page's status can change while we move it among lru. If an evictable 649 * page is on unevictable list, it never be freed. To avoid that, 650 * check after we added it to the list, again. 651 */ 652 if (lru == LRU_UNEVICTABLE && page_evictable(page, NULL)) { 653 if (!isolate_lru_page(page)) { 654 put_page(page); 655 goto redo; 656 } 657 /* This means someone else dropped this page from LRU 658 * So, it will be freed or putback to LRU again. There is 659 * nothing to do here. 660 */ 661 } 662 663 if (was_unevictable && lru != LRU_UNEVICTABLE) 664 count_vm_event(UNEVICTABLE_PGRESCUED); 665 else if (!was_unevictable && lru == LRU_UNEVICTABLE) 666 count_vm_event(UNEVICTABLE_PGCULLED); 667 668 put_page(page); /* drop ref from isolate */ 669} 670 671enum page_references { 672 PAGEREF_RECLAIM, 673 PAGEREF_RECLAIM_CLEAN, 674 PAGEREF_KEEP, 675 PAGEREF_ACTIVATE, 676}; 677 678static enum page_references page_check_references(struct page *page, 679 struct scan_control *sc) 680{ 681 int referenced_ptes, referenced_page; 682 unsigned long vm_flags; 683 684 referenced_ptes = page_referenced(page, 1, sc->mem_cgroup, &vm_flags); 685 referenced_page = TestClearPageReferenced(page); 686 687 /* Lumpy reclaim - ignore references */ 688 if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM) 689 return PAGEREF_RECLAIM; 690 691 /* 692 * Mlock lost the isolation race with us. Let try_to_unmap() 693 * move the page to the unevictable list. 694 */ 695 if (vm_flags & VM_LOCKED) 696 return PAGEREF_RECLAIM; 697 698 if (referenced_ptes) { 699 if (PageAnon(page)) 700 return PAGEREF_ACTIVATE; 701 /* 702 * All mapped pages start out with page table 703 * references from the instantiating fault, so we need 704 * to look twice if a mapped file page is used more 705 * than once. 706 * 707 * Mark it and spare it for another trip around the 708 * inactive list. Another page table reference will 709 * lead to its activation. 710 * 711 * Note: the mark is set for activated pages as well 712 * so that recently deactivated but used pages are 713 * quickly recovered. 714 */ 715 SetPageReferenced(page); 716 717 if (referenced_page) 718 return PAGEREF_ACTIVATE; 719 720 return PAGEREF_KEEP; 721 } 722 723 /* Reclaim if clean, defer dirty pages to writeback */ 724 if (referenced_page && !PageSwapBacked(page)) 725 return PAGEREF_RECLAIM_CLEAN; 726 727 return PAGEREF_RECLAIM; 728} 729 730static noinline_for_stack void free_page_list(struct list_head *free_pages) 731{ 732 struct pagevec freed_pvec; 733 struct page *page, *tmp; 734 735 pagevec_init(&freed_pvec, 1); 736 737 list_for_each_entry_safe(page, tmp, free_pages, lru) { 738 list_del(&page->lru); 739 if (!pagevec_add(&freed_pvec, page)) { 740 __pagevec_free(&freed_pvec); 741 pagevec_reinit(&freed_pvec); 742 } 743 } 744 745 pagevec_free(&freed_pvec); 746} 747 748/* 749 * shrink_page_list() returns the number of reclaimed pages 750 */ 751static unsigned long shrink_page_list(struct list_head *page_list, 752 struct zone *zone, 753 struct scan_control *sc) 754{ 755 LIST_HEAD(ret_pages); 756 LIST_HEAD(free_pages); 757 int pgactivate = 0; 758 unsigned long nr_dirty = 0; 759 unsigned long nr_congested = 0; 760 unsigned long nr_reclaimed = 0; 761 762 cond_resched(); 763 764 while (!list_empty(page_list)) { 765 enum page_references references; 766 struct address_space *mapping; 767 struct page *page; 768 int may_enter_fs; 769 770 cond_resched(); 771 772 page = lru_to_page(page_list); 773 list_del(&page->lru); 774 775 if (!trylock_page(page)) 776 goto keep; 777 778 VM_BUG_ON(PageActive(page)); 779 VM_BUG_ON(page_zone(page) != zone); 780 781 sc->nr_scanned++; 782 783 if (unlikely(!page_evictable(page, NULL))) 784 goto cull_mlocked; 785 786 if (!sc->may_unmap && page_mapped(page)) 787 goto keep_locked; 788 789 /* Double the slab pressure for mapped and swapcache pages */ 790 if (page_mapped(page) || PageSwapCache(page)) 791 sc->nr_scanned++; 792 793 may_enter_fs = (sc->gfp_mask & __GFP_FS) || 794 (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); 795 796 if (PageWriteback(page)) { 797 /* 798 * Synchronous reclaim cannot queue pages for 799 * writeback due to the possibility of stack overflow 800 * but if it encounters a page under writeback, wait 801 * for the IO to complete. 802 */ 803 if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) && 804 may_enter_fs) 805 wait_on_page_writeback(page); 806 else { 807 unlock_page(page); 808 goto keep_lumpy; 809 } 810 } 811 812 references = page_check_references(page, sc); 813 switch (references) { 814 case PAGEREF_ACTIVATE: 815 goto activate_locked; 816 case PAGEREF_KEEP: 817 goto keep_locked; 818 case PAGEREF_RECLAIM: 819 case PAGEREF_RECLAIM_CLEAN: 820 ; /* try to reclaim the page below */ 821 } 822 823 /* 824 * Anonymous process memory has backing store? 825 * Try to allocate it some swap space here. 826 */ 827 if (PageAnon(page) && !PageSwapCache(page)) { 828 if (!(sc->gfp_mask & __GFP_IO)) 829 goto keep_locked; 830 if (!add_to_swap(page)) 831 goto activate_locked; 832 may_enter_fs = 1; 833 } 834 835 mapping = page_mapping(page); 836 837 /* 838 * The page is mapped into the page tables of one or more 839 * processes. Try to unmap it here. 840 */ 841 if (page_mapped(page) && mapping) { 842 switch (try_to_unmap(page, TTU_UNMAP)) { 843 case SWAP_FAIL: 844 goto activate_locked; 845 case SWAP_AGAIN: 846 goto keep_locked; 847 case SWAP_MLOCK: 848 goto cull_mlocked; 849 case SWAP_SUCCESS: 850 ; /* try to free the page below */ 851 } 852 } 853 854 if (PageDirty(page)) { 855 nr_dirty++; 856 857 /* 858 * Only kswapd can writeback filesystem pages to 859 * avoid risk of stack overflow 860 */ 861 if (page_is_file_cache(page) && !current_is_kswapd()) { 862 inc_zone_page_state(page, NR_VMSCAN_WRITE_SKIP); 863 goto keep_locked; 864 } 865 866 if (references == PAGEREF_RECLAIM_CLEAN) 867 goto keep_locked; 868 if (!may_enter_fs) 869 goto keep_locked; 870 if (!sc->may_writepage) 871 goto keep_locked; 872 873 /* Page is dirty, try to write it out here */ 874 switch (pageout(page, mapping, sc)) { 875 case PAGE_KEEP: 876 nr_congested++; 877 goto keep_locked; 878 case PAGE_ACTIVATE: 879 goto activate_locked; 880 case PAGE_SUCCESS: 881 if (PageWriteback(page)) 882 goto keep_lumpy; 883 if (PageDirty(page)) 884 goto keep; 885 886 /* 887 * A synchronous write - probably a ramdisk. Go 888 * ahead and try to reclaim the page. 889 */ 890 if (!trylock_page(page)) 891 goto keep; 892 if (PageDirty(page) || PageWriteback(page)) 893 goto keep_locked; 894 mapping = page_mapping(page); 895 case PAGE_CLEAN: 896 ; /* try to free the page below */ 897 } 898 } 899 900 /* 901 * If the page has buffers, try to free the buffer mappings 902 * associated with this page. If we succeed we try to free 903 * the page as well. 904 * 905 * We do this even if the page is PageDirty(). 906 * try_to_release_page() does not perform I/O, but it is 907 * possible for a page to have PageDirty set, but it is actually 908 * clean (all its buffers are clean). This happens if the 909 * buffers were written out directly, with submit_bh(). ext3 910 * will do this, as well as the blockdev mapping. 911 * try_to_release_page() will discover that cleanness and will 912 * drop the buffers and mark the page clean - it can be freed. 913 * 914 * Rarely, pages can have buffers and no ->mapping. These are 915 * the pages which were not successfully invalidated in 916 * truncate_complete_page(). We try to drop those buffers here 917 * and if that worked, and the page is no longer mapped into 918 * process address space (page_count == 1) it can be freed. 919 * Otherwise, leave the page on the LRU so it is swappable. 920 */ 921 if (page_has_private(page)) { 922 if (!try_to_release_page(page, sc->gfp_mask)) 923 goto activate_locked; 924 if (!mapping && page_count(page) == 1) { 925 unlock_page(page); 926 if (put_page_testzero(page)) 927 goto free_it; 928 else { 929 /* 930 * rare race with speculative reference. 931 * the speculative reference will free 932 * this page shortly, so we may 933 * increment nr_reclaimed here (and 934 * leave it off the LRU). 935 */ 936 nr_reclaimed++; 937 continue; 938 } 939 } 940 } 941 942 if (!mapping || !__remove_mapping(mapping, page)) 943 goto keep_locked; 944 945 /* 946 * At this point, we have no other references and there is 947 * no way to pick any more up (removed from LRU, removed 948 * from pagecache). Can use non-atomic bitops now (and 949 * we obviously don't have to worry about waking up a process 950 * waiting on the page lock, because there are no references. 951 */ 952 __clear_page_locked(page); 953free_it: 954 nr_reclaimed++; 955 956 /* 957 * Is there need to periodically free_page_list? It would 958 * appear not as the counts should be low 959 */ 960 list_add(&page->lru, &free_pages); 961 continue; 962 963cull_mlocked: 964 if (PageSwapCache(page)) 965 try_to_free_swap(page); 966 unlock_page(page); 967 putback_lru_page(page); 968 reset_reclaim_mode(sc); 969 continue; 970 971activate_locked: 972 /* Not a candidate for swapping, so reclaim swap space. */ 973 if (PageSwapCache(page) && vm_swap_full()) 974 try_to_free_swap(page); 975 VM_BUG_ON(PageActive(page)); 976 SetPageActive(page); 977 pgactivate++; 978keep_locked: 979 unlock_page(page); 980keep: 981 reset_reclaim_mode(sc); 982keep_lumpy: 983 list_add(&page->lru, &ret_pages); 984 VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); 985 } 986 987 /* 988 * Tag a zone as congested if all the dirty pages encountered were 989 * backed by a congested BDI. In this case, reclaimers should just 990 * back off and wait for congestion to clear because further reclaim 991 * will encounter the same problem 992 */ 993 if (nr_dirty && nr_dirty == nr_congested && scanning_global_lru(sc)) 994 zone_set_flag(zone, ZONE_CONGESTED); 995 996 free_page_list(&free_pages); 997 998 list_splice(&ret_pages, page_list); 999 count_vm_events(PGACTIVATE, pgactivate); 1000 return nr_reclaimed; 1001} 1002 1003/* 1004 * Attempt to remove the specified page from its LRU. Only take this page 1005 * if it is of the appropriate PageActive status. Pages which are being 1006 * freed elsewhere are also ignored. 1007 * 1008 * page: page to consider 1009 * mode: one of the LRU isolation modes defined above 1010 * 1011 * returns 0 on success, -ve errno on failure. 1012 */ 1013int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file) 1014{ 1015 bool all_lru_mode; 1016 int ret = -EINVAL; 1017 1018 /* Only take pages on the LRU. */ 1019 if (!PageLRU(page)) 1020 return ret; 1021 1022 all_lru_mode = (mode & (ISOLATE_ACTIVE|ISOLATE_INACTIVE)) == 1023 (ISOLATE_ACTIVE|ISOLATE_INACTIVE); 1024 1025 /* 1026 * When checking the active state, we need to be sure we are 1027 * dealing with comparible boolean values. Take the logical not 1028 * of each. 1029 */ 1030 if (!all_lru_mode && !PageActive(page) != !(mode & ISOLATE_ACTIVE)) 1031 return ret; 1032 1033 if (!all_lru_mode && !!page_is_file_cache(page) != file) 1034 return ret; 1035 1036 /* 1037 * When this function is being called for lumpy reclaim, we 1038 * initially look into all LRU pages, active, inactive and 1039 * unevictable; only give shrink_page_list evictable pages. 1040 */ 1041 if (PageUnevictable(page)) 1042 return ret; 1043 1044 ret = -EBUSY; 1045 1046 if ((mode & ISOLATE_CLEAN) && (PageDirty(page) || PageWriteback(page))) 1047 return ret; 1048 1049 if ((mode & ISOLATE_UNMAPPED) && page_mapped(page)) 1050 return ret; 1051 1052 if (likely(get_page_unless_zero(page))) { 1053 /* 1054 * Be careful not to clear PageLRU until after we're 1055 * sure the page is not being freed elsewhere -- the 1056 * page release code relies on it. 1057 */ 1058 ClearPageLRU(page); 1059 ret = 0; 1060 } 1061 1062 return ret; 1063} 1064 1065/* 1066 * zone->lru_lock is heavily contended. Some of the functions that 1067 * shrink the lists perform better by taking out a batch of pages 1068 * and working on them outside the LRU lock. 1069 * 1070 * For pagecache intensive workloads, this function is the hottest 1071 * spot in the kernel (apart from copy_*_user functions). 1072 * 1073 * Appropriate locks must be held before calling this function. 1074 * 1075 * @nr_to_scan: The number of pages to look through on the list. 1076 * @src: The LRU list to pull pages off. 1077 * @dst: The temp list to put pages on to. 1078 * @scanned: The number of pages that were scanned. 1079 * @order: The caller's attempted allocation order 1080 * @mode: One of the LRU isolation modes 1081 * @file: True [1] if isolating file [!anon] pages 1082 * 1083 * returns how many pages were moved onto *@dst. 1084 */ 1085static unsigned long isolate_lru_pages(unsigned long nr_to_scan, 1086 struct list_head *src, struct list_head *dst, 1087 unsigned long *scanned, int order, isolate_mode_t mode, 1088 int file) 1089{ 1090 unsigned long nr_taken = 0; 1091 unsigned long nr_lumpy_taken = 0; 1092 unsigned long nr_lumpy_dirty = 0; 1093 unsigned long nr_lumpy_failed = 0; 1094 unsigned long scan; 1095 1096 for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) { 1097 struct page *page; 1098 unsigned long pfn; 1099 unsigned long end_pfn; 1100 unsigned long page_pfn; 1101 int zone_id; 1102 1103 page = lru_to_page(src); 1104 prefetchw_prev_lru_page(page, src, flags); 1105 1106 VM_BUG_ON(!PageLRU(page)); 1107 1108 switch (__isolate_lru_page(page, mode, file)) { 1109 case 0: 1110 list_move(&page->lru, dst); 1111 mem_cgroup_del_lru(page); 1112 nr_taken += hpage_nr_pages(page); 1113 break; 1114 1115 case -EBUSY: 1116 /* else it is being freed elsewhere */ 1117 list_move(&page->lru, src); 1118 mem_cgroup_rotate_lru_list(page, page_lru(page)); 1119 continue; 1120 1121 default: 1122 BUG(); 1123 } 1124 1125 if (!order) 1126 continue; 1127 1128 /* 1129 * Attempt to take all pages in the order aligned region 1130 * surrounding the tag page. Only take those pages of 1131 * the same active state as that tag page. We may safely 1132 * round the target page pfn down to the requested order 1133 * as the mem_map is guaranteed valid out to MAX_ORDER, 1134 * where that page is in a different zone we will detect 1135 * it from its zone id and abort this block scan. 1136 */ 1137 zone_id = page_zone_id(page); 1138 page_pfn = page_to_pfn(page); 1139 pfn = page_pfn & ~((1 << order) - 1); 1140 end_pfn = pfn + (1 << order); 1141 for (; pfn < end_pfn; pfn++) { 1142 struct page *cursor_page; 1143 1144 /* The target page is in the block, ignore it. */ 1145 if (unlikely(pfn == page_pfn)) 1146 continue; 1147 1148 /* Avoid holes within the zone. */ 1149 if (unlikely(!pfn_valid_within(pfn))) 1150 break; 1151 1152 cursor_page = pfn_to_page(pfn); 1153 1154 /* Check that we have not crossed a zone boundary. */ 1155 if (unlikely(page_zone_id(cursor_page) != zone_id)) 1156 break; 1157 1158 /* 1159 * If we don't have enough swap space, reclaiming of 1160 * anon page which don't already have a swap slot is 1161 * pointless. 1162 */ 1163 if (nr_swap_pages <= 0 && PageAnon(cursor_page) && 1164 !PageSwapCache(cursor_page)) 1165 break; 1166 1167 if (__isolate_lru_page(cursor_page, mode, file) == 0) { 1168 list_move(&cursor_page->lru, dst); 1169 mem_cgroup_del_lru(cursor_page); 1170 nr_taken += hpage_nr_pages(page); 1171 nr_lumpy_taken++; 1172 if (PageDirty(cursor_page)) 1173 nr_lumpy_dirty++; 1174 scan++; 1175 } else { 1176 /* 1177 * Check if the page is freed already. 1178 * 1179 * We can't use page_count() as that 1180 * requires compound_head and we don't 1181 * have a pin on the page here. If a 1182 * page is tail, we may or may not 1183 * have isolated the head, so assume 1184 * it's not free, it'd be tricky to 1185 * track the head status without a 1186 * page pin. 1187 */ 1188 if (!PageTail(cursor_page) && 1189 !atomic_read(&cursor_page->_count)) 1190 continue; 1191 break; 1192 } 1193 } 1194 1195 /* If we break out of the loop above, lumpy reclaim failed */ 1196 if (pfn < end_pfn) 1197 nr_lumpy_failed++; 1198 } 1199 1200 *scanned = scan; 1201 1202 trace_mm_vmscan_lru_isolate(order, 1203 nr_to_scan, scan, 1204 nr_taken, 1205 nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, 1206 mode); 1207 return nr_taken; 1208} 1209 1210static unsigned long isolate_pages_global(unsigned long nr, 1211 struct list_head *dst, 1212 unsigned long *scanned, int order, 1213 isolate_mode_t mode, 1214 struct zone *z, int active, int file) 1215{ 1216 int lru = LRU_BASE; 1217 if (active) 1218 lru += LRU_ACTIVE; 1219 if (file) 1220 lru += LRU_FILE; 1221 return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order, 1222 mode, file); 1223} 1224 1225/* 1226 * clear_active_flags() is a helper for shrink_active_list(), clearing 1227 * any active bits from the pages in the list. 1228 */ 1229static unsigned long clear_active_flags(struct list_head *page_list, 1230 unsigned int *count) 1231{ 1232 int nr_active = 0; 1233 int lru; 1234 struct page *page; 1235 1236 list_for_each_entry(page, page_list, lru) { 1237 int numpages = hpage_nr_pages(page); 1238 lru = page_lru_base_type(page); 1239 if (PageActive(page)) { 1240 lru += LRU_ACTIVE; 1241 ClearPageActive(page); 1242 nr_active += numpages; 1243 } 1244 if (count) 1245 count[lru] += numpages; 1246 } 1247 1248 return nr_active; 1249} 1250 1251/** 1252 * isolate_lru_page - tries to isolate a page from its LRU list 1253 * @page: page to isolate from its LRU list 1254 * 1255 * Isolates a @page from an LRU list, clears PageLRU and adjusts the 1256 * vmstat statistic corresponding to whatever LRU list the page was on. 1257 * 1258 * Returns 0 if the page was removed from an LRU list. 1259 * Returns -EBUSY if the page was not on an LRU list. 1260 * 1261 * The returned page will have PageLRU() cleared. If it was found on 1262 * the active list, it will have PageActive set. If it was found on 1263 * the unevictable list, it will have the PageUnevictable bit set. That flag 1264 * may need to be cleared by the caller before letting the page go. 1265 * 1266 * The vmstat statistic corresponding to the list on which the page was 1267 * found will be decremented. 1268 * 1269 * Restrictions: 1270 * (1) Must be called with an elevated refcount on the page. This is a 1271 * fundamentnal difference from isolate_lru_pages (which is called 1272 * without a stable reference). 1273 * (2) the lru_lock must not be held. 1274 * (3) interrupts must be enabled. 1275 */ 1276int isolate_lru_page(struct page *page) 1277{ 1278 int ret = -EBUSY; 1279 1280 VM_BUG_ON(!page_count(page)); 1281 1282 if (PageLRU(page)) { 1283 struct zone *zone = page_zone(page); 1284 1285 spin_lock_irq(&zone->lru_lock); 1286 if (PageLRU(page)) { 1287 int lru = page_lru(page); 1288 ret = 0; 1289 get_page(page); 1290 ClearPageLRU(page); 1291 1292 del_page_from_lru_list(zone, page, lru); 1293 } 1294 spin_unlock_irq(&zone->lru_lock); 1295 } 1296 return ret; 1297} 1298 1299/* 1300 * Are there way too many processes in the direct reclaim path already? 1301 */ 1302static int too_many_isolated(struct zone *zone, int file, 1303 struct scan_control *sc) 1304{ 1305 unsigned long inactive, isolated; 1306 1307 if (current_is_kswapd()) 1308 return 0; 1309 1310 if (!scanning_global_lru(sc)) 1311 return 0; 1312 1313 if (file) { 1314 inactive = zone_page_state(zone, NR_INACTIVE_FILE); 1315 isolated = zone_page_state(zone, NR_ISOLATED_FILE); 1316 } else { 1317 inactive = zone_page_state(zone, NR_INACTIVE_ANON); 1318 isolated = zone_page_state(zone, NR_ISOLATED_ANON); 1319 } 1320 1321 return isolated > inactive; 1322} 1323 1324/* 1325 * TODO: Try merging with migrations version of putback_lru_pages 1326 */ 1327static noinline_for_stack void 1328putback_lru_pages(struct zone *zone, struct scan_control *sc, 1329 unsigned long nr_anon, unsigned long nr_file, 1330 struct list_head *page_list) 1331{ 1332 struct page *page; 1333 struct pagevec pvec; 1334 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); 1335 1336 pagevec_init(&pvec, 1); 1337 1338 /* 1339 * Put back any unfreeable pages. 1340 */ 1341 spin_lock(&zone->lru_lock); 1342 while (!list_empty(page_list)) { 1343 int lru; 1344 page = lru_to_page(page_list); 1345 VM_BUG_ON(PageLRU(page)); 1346 list_del(&page->lru); 1347 if (unlikely(!page_evictable(page, NULL))) { 1348 spin_unlock_irq(&zone->lru_lock); 1349 putback_lru_page(page); 1350 spin_lock_irq(&zone->lru_lock); 1351 continue; 1352 } 1353 SetPageLRU(page); 1354 lru = page_lru(page); 1355 add_page_to_lru_list(zone, page, lru); 1356 if (is_active_lru(lru)) { 1357 int file = is_file_lru(lru); 1358 int numpages = hpage_nr_pages(page); 1359 reclaim_stat->recent_rotated[file] += numpages; 1360 } 1361 if (!pagevec_add(&pvec, page)) { 1362 spin_unlock_irq(&zone->lru_lock); 1363 __pagevec_release(&pvec); 1364 spin_lock_irq(&zone->lru_lock); 1365 } 1366 } 1367 __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon); 1368 __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file); 1369 1370 spin_unlock_irq(&zone->lru_lock); 1371 pagevec_release(&pvec); 1372} 1373 1374static noinline_for_stack void update_isolated_counts(struct zone *zone, 1375 struct scan_control *sc, 1376 unsigned long *nr_anon, 1377 unsigned long *nr_file, 1378 struct list_head *isolated_list) 1379{ 1380 unsigned long nr_active; 1381 unsigned int count[NR_LRU_LISTS] = { 0, }; 1382 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); 1383 1384 nr_active = clear_active_flags(isolated_list, count); 1385 __count_vm_events(PGDEACTIVATE, nr_active); 1386 1387 __mod_zone_page_state(zone, NR_ACTIVE_FILE, 1388 -count[LRU_ACTIVE_FILE]); 1389 __mod_zone_page_state(zone, NR_INACTIVE_FILE, 1390 -count[LRU_INACTIVE_FILE]); 1391 __mod_zone_page_state(zone, NR_ACTIVE_ANON, 1392 -count[LRU_ACTIVE_ANON]); 1393 __mod_zone_page_state(zone, NR_INACTIVE_ANON, 1394 -count[LRU_INACTIVE_ANON]); 1395 1396 *nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON]; 1397 *nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE]; 1398 __mod_zone_page_state(zone, NR_ISOLATED_ANON, *nr_anon); 1399 __mod_zone_page_state(zone, NR_ISOLATED_FILE, *nr_file); 1400 1401 reclaim_stat->recent_scanned[0] += *nr_anon; 1402 reclaim_stat->recent_scanned[1] += *nr_file; 1403} 1404 1405/* 1406 * Returns true if a direct reclaim should wait on pages under writeback. 1407 * 1408 * If we are direct reclaiming for contiguous pages and we do not reclaim 1409 * everything in the list, try again and wait for writeback IO to complete. 1410 * This will stall high-order allocations noticeably. Only do that when really 1411 * need to free the pages under high memory pressure. 1412 */ 1413static inline bool should_reclaim_stall(unsigned long nr_taken, 1414 unsigned long nr_freed, 1415 int priority, 1416 struct scan_control *sc) 1417{ 1418 int lumpy_stall_priority; 1419 1420 /* kswapd should not stall on sync IO */ 1421 if (current_is_kswapd()) 1422 return false; 1423 1424 /* Only stall on lumpy reclaim */ 1425 if (sc->reclaim_mode & RECLAIM_MODE_SINGLE) 1426 return false; 1427 1428 /* If we have reclaimed everything on the isolated list, no stall */ 1429 if (nr_freed == nr_taken) 1430 return false; 1431 1432 /* 1433 * For high-order allocations, there are two stall thresholds. 1434 * High-cost allocations stall immediately where as lower 1435 * order allocations such as stacks require the scanning 1436 * priority to be much higher before stalling. 1437 */ 1438 if (sc->order > PAGE_ALLOC_COSTLY_ORDER) 1439 lumpy_stall_priority = DEF_PRIORITY; 1440 else 1441 lumpy_stall_priority = DEF_PRIORITY / 3; 1442 1443 return priority <= lumpy_stall_priority; 1444} 1445 1446/* 1447 * shrink_inactive_list() is a helper for shrink_zone(). It returns the number 1448 * of reclaimed pages 1449 */ 1450static noinline_for_stack unsigned long 1451shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, 1452 struct scan_control *sc, int priority, int file) 1453{ 1454 LIST_HEAD(page_list); 1455 unsigned long nr_scanned; 1456 unsigned long nr_reclaimed = 0; 1457 unsigned long nr_taken; 1458 unsigned long nr_anon; 1459 unsigned long nr_file; 1460 isolate_mode_t reclaim_mode = ISOLATE_INACTIVE; 1461 1462 while (unlikely(too_many_isolated(zone, file, sc))) { 1463 congestion_wait(BLK_RW_ASYNC, HZ/10); 1464 1465 /* We are about to die and free our memory. Return now. */ 1466 if (fatal_signal_pending(current)) 1467 return SWAP_CLUSTER_MAX; 1468 } 1469 1470 set_reclaim_mode(priority, sc, false); 1471 if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM) 1472 reclaim_mode |= ISOLATE_ACTIVE; 1473 1474 lru_add_drain(); 1475 1476 if (!sc->may_unmap) 1477 reclaim_mode |= ISOLATE_UNMAPPED; 1478 if (!sc->may_writepage) 1479 reclaim_mode |= ISOLATE_CLEAN; 1480 1481 spin_lock_irq(&zone->lru_lock); 1482 1483 if (scanning_global_lru(sc)) { 1484 nr_taken = isolate_pages_global(nr_to_scan, &page_list, 1485 &nr_scanned, sc->order, reclaim_mode, zone, 0, file); 1486 zone->pages_scanned += nr_scanned; 1487 if (current_is_kswapd()) 1488 __count_zone_vm_events(PGSCAN_KSWAPD, zone, 1489 nr_scanned); 1490 else 1491 __count_zone_vm_events(PGSCAN_DIRECT, zone, 1492 nr_scanned); 1493 } else { 1494 nr_taken = mem_cgroup_isolate_pages(nr_to_scan, &page_list, 1495 &nr_scanned, sc->order, reclaim_mode, zone, 1496 sc->mem_cgroup, 0, file); 1497 /* 1498 * mem_cgroup_isolate_pages() keeps track of 1499 * scanned pages on its own. 1500 */ 1501 } 1502 1503 if (nr_taken == 0) { 1504 spin_unlock_irq(&zone->lru_lock); 1505 return 0; 1506 } 1507 1508 update_isolated_counts(zone, sc, &nr_anon, &nr_file, &page_list); 1509 1510 spin_unlock_irq(&zone->lru_lock); 1511 1512 nr_reclaimed = shrink_page_list(&page_list, zone, sc); 1513 1514 /* Check if we should syncronously wait for writeback */ 1515 if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { 1516 set_reclaim_mode(priority, sc, true); 1517 nr_reclaimed += shrink_page_list(&page_list, zone, sc); 1518 } 1519 1520 local_irq_disable(); 1521 if (current_is_kswapd()) 1522 __count_vm_events(KSWAPD_STEAL, nr_reclaimed); 1523 __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed); 1524 1525 putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list); 1526 1527 trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, 1528 zone_idx(zone), 1529 nr_scanned, nr_reclaimed, 1530 priority, 1531 trace_shrink_flags(file, sc->reclaim_mode)); 1532 return nr_reclaimed; 1533} 1534 1535/* 1536 * This moves pages from the active list to the inactive list. 1537 * 1538 * We move them the other way if the page is referenced by one or more 1539 * processes, from rmap. 1540 * 1541 * If the pages are mostly unmapped, the processing is fast and it is 1542 * appropriate to hold zone->lru_lock across the whole operation. But if 1543 * the pages are mapped, the processing is slow (page_referenced()) so we 1544 * should drop zone->lru_lock around each page. It's impossible to balance 1545 * this, so instead we remove the pages from the LRU while processing them. 1546 * It is safe to rely on PG_active against the non-LRU pages in here because 1547 * nobody will play with that bit on a non-LRU page. 1548 * 1549 * The downside is that we have to touch page->_count against each page. 1550 * But we had to alter page->flags anyway. 1551 */ 1552 1553static void move_active_pages_to_lru(struct zone *zone, 1554 struct list_head *list, 1555 enum lru_list lru) 1556{ 1557 unsigned long pgmoved = 0; 1558 struct pagevec pvec; 1559 struct page *page; 1560 1561 pagevec_init(&pvec, 1); 1562 1563 while (!list_empty(list)) { 1564 page = lru_to_page(list); 1565 1566 VM_BUG_ON(PageLRU(page)); 1567 SetPageLRU(page); 1568 1569 list_move(&page->lru, &zone->lru[lru].list); 1570 mem_cgroup_add_lru_list(page, lru); 1571 pgmoved += hpage_nr_pages(page); 1572 1573 if (!pagevec_add(&pvec, page) || list_empty(list)) { 1574 spin_unlock_irq(&zone->lru_lock); 1575 if (buffer_heads_over_limit) 1576 pagevec_strip(&pvec); 1577 __pagevec_release(&pvec); 1578 spin_lock_irq(&zone->lru_lock); 1579 } 1580 } 1581 __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); 1582 if (!is_active_lru(lru)) 1583 __count_vm_events(PGDEACTIVATE, pgmoved); 1584} 1585 1586static void shrink_active_list(unsigned long nr_pages, struct zone *zone, 1587 struct scan_control *sc, int priority, int file) 1588{ 1589 unsigned long nr_taken; 1590 unsigned long pgscanned; 1591 unsigned long vm_flags; 1592 LIST_HEAD(l_hold); /* The pages which were snipped off */ 1593 LIST_HEAD(l_active); 1594 LIST_HEAD(l_inactive); 1595 struct page *page; 1596 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); 1597 unsigned long nr_rotated = 0; 1598 isolate_mode_t reclaim_mode = ISOLATE_ACTIVE; 1599 1600 lru_add_drain(); 1601 1602 if (!sc->may_unmap) 1603 reclaim_mode |= ISOLATE_UNMAPPED; 1604 if (!sc->may_writepage) 1605 reclaim_mode |= ISOLATE_CLEAN; 1606 1607 spin_lock_irq(&zone->lru_lock); 1608 if (scanning_global_lru(sc)) { 1609 nr_taken = isolate_pages_global(nr_pages, &l_hold, 1610 &pgscanned, sc->order, 1611 reclaim_mode, zone, 1612 1, file); 1613 zone->pages_scanned += pgscanned; 1614 } else { 1615 nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold, 1616 &pgscanned, sc->order, 1617 reclaim_mode, zone, 1618 sc->mem_cgroup, 1, file); 1619 /* 1620 * mem_cgroup_isolate_pages() keeps track of 1621 * scanned pages on its own. 1622 */ 1623 } 1624 1625 reclaim_stat->recent_scanned[file] += nr_taken; 1626 1627 __count_zone_vm_events(PGREFILL, zone, pgscanned); 1628 if (file) 1629 __mod_zone_page_state(zone, NR_ACTIVE_FILE, -nr_taken); 1630 else 1631 __mod_zone_page_state(zone, NR_ACTIVE_ANON, -nr_taken); 1632 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); 1633 spin_unlock_irq(&zone->lru_lock); 1634 1635 while (!list_empty(&l_hold)) { 1636 cond_resched(); 1637 page = lru_to_page(&l_hold); 1638 list_del(&page->lru); 1639 1640 if (unlikely(!page_evictable(page, NULL))) { 1641 putback_lru_page(page); 1642 continue; 1643 } 1644 1645 if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) { 1646 nr_rotated += hpage_nr_pages(page); 1647 /* 1648 * Identify referenced, file-backed active pages and 1649 * give them one more trip around the active list. So 1650 * that executable code get better chances to stay in 1651 * memory under moderate memory pressure. Anon pages 1652 * are not likely to be evicted by use-once streaming 1653 * IO, plus JVM can create lots of anon VM_EXEC pages, 1654 * so we ignore them here. 1655 */ 1656 if ((vm_flags & VM_EXEC) && page_is_file_cache(page)) { 1657 list_add(&page->lru, &l_active); 1658 continue; 1659 } 1660 } 1661 1662 ClearPageActive(page); /* we are de-activating */ 1663 list_add(&page->lru, &l_inactive); 1664 } 1665 1666 /* 1667 * Move pages back to the lru list. 1668 */ 1669 spin_lock_irq(&zone->lru_lock); 1670 /* 1671 * Count referenced pages from currently used mappings as rotated, 1672 * even though only some of them are actually re-activated. This 1673 * helps balance scan pressure between file and anonymous pages in 1674 * get_scan_ratio. 1675 */ 1676 reclaim_stat->recent_rotated[file] += nr_rotated; 1677 1678 move_active_pages_to_lru(zone, &l_active, 1679 LRU_ACTIVE + file * LRU_FILE); 1680 move_active_pages_to_lru(zone, &l_inactive, 1681 LRU_BASE + file * LRU_FILE); 1682 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); 1683 spin_unlock_irq(&zone->lru_lock); 1684} 1685 1686#ifdef CONFIG_SWAP 1687static int inactive_anon_is_low_global(struct zone *zone) 1688{ 1689 unsigned long active, inactive; 1690 1691 active = zone_page_state(zone, NR_ACTIVE_ANON); 1692 inactive = zone_page_state(zone, NR_INACTIVE_ANON); 1693 1694 if (inactive * zone->inactive_ratio < active) 1695 return 1; 1696 1697 return 0; 1698} 1699 1700/** 1701 * inactive_anon_is_low - check if anonymous pages need to be deactivated 1702 * @zone: zone to check 1703 * @sc: scan control of this context 1704 * 1705 * Returns true if the zone does not have enough inactive anon pages, 1706 * meaning some active anon pages need to be deactivated. 1707 */ 1708static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc) 1709{ 1710 int low; 1711 1712 /* 1713 * If we don't have swap space, anonymous page deactivation 1714 * is pointless. 1715 */ 1716 if (!total_swap_pages) 1717 return 0; 1718 1719 if (scanning_global_lru(sc)) 1720 low = inactive_anon_is_low_global(zone); 1721 else 1722 low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup); 1723 return low; 1724} 1725#else 1726static inline int inactive_anon_is_low(struct zone *zone, 1727 struct scan_control *sc) 1728{ 1729 return 0; 1730} 1731#endif 1732 1733static int inactive_file_is_low_global(struct zone *zone) 1734{ 1735 unsigned long active, inactive; 1736 1737 active = zone_page_state(zone, NR_ACTIVE_FILE); 1738 inactive = zone_page_state(zone, NR_INACTIVE_FILE); 1739 1740 return (active > inactive); 1741} 1742 1743/** 1744 * inactive_file_is_low - check if file pages need to be deactivated 1745 * @zone: zone to check 1746 * @sc: scan control of this context 1747 * 1748 * When the system is doing streaming IO, memory pressure here 1749 * ensures that active file pages get deactivated, until more 1750 * than half of the file pages are on the inactive list. 1751 * 1752 * Once we get to that situation, protect the system's working 1753 * set from being evicted by disabling active file page aging. 1754 * 1755 * This uses a different ratio than the anonymous pages, because 1756 * the page cache uses a use-once replacement algorithm. 1757 */ 1758static int inactive_file_is_low(struct zone *zone, struct scan_control *sc) 1759{ 1760 int low; 1761 1762 if (scanning_global_lru(sc)) 1763 low = inactive_file_is_low_global(zone); 1764 else 1765 low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup); 1766 return low; 1767} 1768 1769static int inactive_list_is_low(struct zone *zone, struct scan_control *sc, 1770 int file) 1771{ 1772 if (file) 1773 return inactive_file_is_low(zone, sc); 1774 else 1775 return inactive_anon_is_low(zone, sc); 1776} 1777 1778static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, 1779 struct zone *zone, struct scan_control *sc, int priority) 1780{ 1781 int file = is_file_lru(lru); 1782 1783 if (is_active_lru(lru)) { 1784 if (inactive_list_is_low(zone, sc, file)) 1785 shrink_active_list(nr_to_scan, zone, sc, priority, file); 1786 return 0; 1787 } 1788 1789 return shrink_inactive_list(nr_to_scan, zone, sc, priority, file); 1790} 1791 1792static int vmscan_swappiness(struct scan_control *sc) 1793{ 1794 if (scanning_global_lru(sc)) 1795 return vm_swappiness; 1796 return mem_cgroup_swappiness(sc->mem_cgroup); 1797} 1798 1799/* 1800 * Determine how aggressively the anon and file LRU lists should be 1801 * scanned. The relative value of each set of LRU lists is determined 1802 * by looking at the fraction of the pages scanned we did rotate back 1803 * onto the active list instead of evict. 1804 * 1805 * nr[0] = anon pages to scan; nr[1] = file pages to scan 1806 */ 1807static void get_scan_count(struct zone *zone, struct scan_control *sc, 1808 unsigned long *nr, int priority) 1809{ 1810 unsigned long anon, file, free; 1811 unsigned long anon_prio, file_prio; 1812 unsigned long ap, fp; 1813 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); 1814 u64 fraction[2], denominator; 1815 enum lru_list l; 1816 int noswap = 0; 1817 bool force_scan = false; 1818 1819 /* 1820 * If the zone or memcg is small, nr[l] can be 0. This 1821 * results in no scanning on this priority and a potential 1822 * priority drop. Global direct reclaim can go to the next 1823 * zone and tends to have no problems. Global kswapd is for 1824 * zone balancing and it needs to scan a minimum amount. When 1825 * reclaiming for a memcg, a priority drop can cause high 1826 * latencies, so it's better to scan a minimum amount there as 1827 * well. 1828 */ 1829 if (scanning_global_lru(sc) && current_is_kswapd()) 1830 force_scan = true; 1831 if (!scanning_global_lru(sc)) 1832 force_scan = true; 1833 1834 /* If we have no swap space, do not bother scanning anon pages. */ 1835 if (!sc->may_swap || (nr_swap_pages <= 0)) { 1836 noswap = 1; 1837 fraction[0] = 0; 1838 fraction[1] = 1; 1839 denominator = 1; 1840 goto out; 1841 } 1842 1843 anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + 1844 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); 1845 file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) + 1846 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); 1847 1848 if (scanning_global_lru(sc)) { 1849 free = zone_page_state(zone, NR_FREE_PAGES); 1850 /* If we have very few page cache pages, 1851 force-scan anon pages. */ 1852 if (unlikely(file + free <= high_wmark_pages(zone))) { 1853 fraction[0] = 1; 1854 fraction[1] = 0; 1855 denominator = 1; 1856 goto out; 1857 } 1858 } 1859 1860 /* 1861 * With swappiness at 100, anonymous and file have the same priority. 1862 * This scanning priority is essentially the inverse of IO cost. 1863 */ 1864 anon_prio = vmscan_swappiness(sc); 1865 file_prio = 200 - vmscan_swappiness(sc); 1866 1867 /* 1868 * OK, so we have swap space and a fair amount of page cache 1869 * pages. We use the recently rotated / recently scanned 1870 * ratios to determine how valuable each cache is. 1871 * 1872 * Because workloads change over time (and to avoid overflow) 1873 * we keep these statistics as a floating average, which ends 1874 * up weighing recent references more than old ones. 1875 * 1876 * anon in [0], file in [1] 1877 */ 1878 spin_lock_irq(&zone->lru_lock); 1879 if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { 1880 reclaim_stat->recent_scanned[0] /= 2; 1881 reclaim_stat->recent_rotated[0] /= 2; 1882 } 1883 1884 if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) { 1885 reclaim_stat->recent_scanned[1] /= 2; 1886 reclaim_stat->recent_rotated[1] /= 2; 1887 } 1888 1889 /* 1890 * The amount of pressure on anon vs file pages is inversely 1891 * proportional to the fraction of recently scanned pages on 1892 * each list that were recently referenced and in active use. 1893 */ 1894 ap = (anon_prio + 1) * (reclaim_stat->recent_scanned[0] + 1); 1895 ap /= reclaim_stat->recent_rotated[0] + 1; 1896 1897 fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1); 1898 fp /= reclaim_stat->recent_rotated[1] + 1; 1899 spin_unlock_irq(&zone->lru_lock); 1900 1901 fraction[0] = ap; 1902 fraction[1] = fp; 1903 denominator = ap + fp + 1; 1904out: 1905 for_each_evictable_lru(l) { 1906 int file = is_file_lru(l); 1907 unsigned long scan; 1908 1909 scan = zone_nr_lru_pages(zone, sc, l); 1910 if (priority || noswap) { 1911 scan >>= priority; 1912 if (!scan && force_scan) 1913 scan = SWAP_CLUSTER_MAX; 1914 scan = div64_u64(scan * fraction[file], denominator); 1915 } 1916 nr[l] = scan; 1917 } 1918} 1919 1920/* 1921 * Reclaim/compaction depends on a number of pages being freed. To avoid 1922 * disruption to the system, a small number of order-0 pages continue to be 1923 * rotated and reclaimed in the normal fashion. However, by the time we get 1924 * back to the allocator and call try_to_compact_zone(), we ensure that 1925 * there are enough free pages for it to be likely successful 1926 */ 1927static inline bool should_continue_reclaim(struct zone *zone, 1928 unsigned long nr_reclaimed, 1929 unsigned long nr_scanned, 1930 struct scan_control *sc) 1931{ 1932 unsigned long pages_for_compaction; 1933 unsigned long inactive_lru_pages; 1934 1935 /* If not in reclaim/compaction mode, stop */ 1936 if (!(sc->reclaim_mode & RECLAIM_MODE_COMPACTION)) 1937 return false; 1938 1939 /* Consider stopping depending on scan and reclaim activity */ 1940 if (sc->gfp_mask & __GFP_REPEAT) { 1941 /* 1942 * For __GFP_REPEAT allocations, stop reclaiming if the 1943 * full LRU list has been scanned and we are still failing 1944 * to reclaim pages. This full LRU scan is potentially 1945 * expensive but a __GFP_REPEAT caller really wants to succeed 1946 */ 1947 if (!nr_reclaimed && !nr_scanned) 1948 return false; 1949 } else { 1950 /* 1951 * For non-__GFP_REPEAT allocations which can presumably 1952 * fail without consequence, stop if we failed to reclaim 1953 * any pages from the last SWAP_CLUSTER_MAX number of 1954 * pages that were scanned. This will return to the 1955 * caller faster at the risk reclaim/compaction and 1956 * the resulting allocation attempt fails 1957 */ 1958 if (!nr_reclaimed) 1959 return false; 1960 } 1961 1962 /* 1963 * If we have not reclaimed enough pages for compaction and the 1964 * inactive lists are large enough, continue reclaiming 1965 */ 1966 pages_for_compaction = (2UL << sc->order); 1967 inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON) + 1968 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); 1969 if (sc->nr_reclaimed < pages_for_compaction && 1970 inactive_lru_pages > pages_for_compaction) 1971 return true; 1972 1973 /* If compaction would go ahead or the allocation would succeed, stop */ 1974 switch (compaction_suitable(zone, sc->order)) { 1975 case COMPACT_PARTIAL: 1976 case COMPACT_CONTINUE: 1977 return false; 1978 default: 1979 return true; 1980 } 1981} 1982 1983/* 1984 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. 1985 */ 1986static void shrink_zone(int priority, struct zone *zone, 1987 struct scan_control *sc) 1988{ 1989 unsigned long nr[NR_LRU_LISTS]; 1990 unsigned long nr_to_scan; 1991 enum lru_list l; 1992 unsigned long nr_reclaimed, nr_scanned; 1993 unsigned long nr_to_reclaim = sc->nr_to_reclaim; 1994 struct blk_plug plug; 1995 1996restart: 1997 nr_reclaimed = 0; 1998 nr_scanned = sc->nr_scanned; 1999 get_scan_count(zone, sc, nr, priority); 2000 2001 blk_start_plug(&plug); 2002 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || 2003 nr[LRU_INACTIVE_FILE]) { 2004 for_each_evictable_lru(l) { 2005 if (nr[l]) { 2006 nr_to_scan = min_t(unsigned long, 2007 nr[l], SWAP_CLUSTER_MAX); 2008 nr[l] -= nr_to_scan; 2009 2010 nr_reclaimed += shrink_list(l, nr_to_scan, 2011 zone, sc, priority); 2012 } 2013 } 2014 /* 2015 * On large memory systems, scan >> priority can become 2016 * really large. This is fine for the starting priority; 2017 * we want to put equal scanning pressure on each zone. 2018 * However, if the VM has a harder time of freeing pages, 2019 * with multiple processes reclaiming pages, the total 2020 * freeing target can get unreasonably large. 2021 */ 2022 if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) 2023 break; 2024 } 2025 blk_finish_plug(&plug); 2026 sc->nr_reclaimed += nr_reclaimed; 2027 2028 /* 2029 * Even if we did not try to evict anon pages at all, we want to 2030 * rebalance the anon lru active/inactive ratio. 2031 */ 2032 if (inactive_anon_is_low(zone, sc)) 2033 shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); 2034 2035 /* reclaim/compaction might need reclaim to continue */ 2036 if (should_continue_reclaim(zone, nr_reclaimed, 2037 sc->nr_scanned - nr_scanned, sc)) 2038 goto restart; 2039 2040 throttle_vm_writeout(sc->gfp_mask); 2041} 2042 2043/* 2044 * This is the direct reclaim path, for page-allocating processes. We only 2045 * try to reclaim pages from zones which will satisfy the caller's allocation 2046 * request. 2047 * 2048 * We reclaim from a zone even if that zone is over high_wmark_pages(zone). 2049 * Because: 2050 * a) The caller may be trying to free *extra* pages to satisfy a higher-order 2051 * allocation or 2052 * b) The target zone may be at high_wmark_pages(zone) but the lower zones 2053 * must go *over* high_wmark_pages(zone) to satisfy the `incremental min' 2054 * zone defense algorithm. 2055 * 2056 * If a zone is deemed to be full of pinned pages then just give it a light 2057 * scan then give up on it. 2058 */ 2059static void shrink_zones(int priority, struct zonelist *zonelist, 2060 struct scan_control *sc) 2061{ 2062 struct zoneref *z; 2063 struct zone *zone; 2064 unsigned long nr_soft_reclaimed; 2065 unsigned long nr_soft_scanned; 2066 2067 for_each_zone_zonelist_nodemask(zone, z, zonelist, 2068 gfp_zone(sc->gfp_mask), sc->nodemask) { 2069 if (!populated_zone(zone)) 2070 continue; 2071 /* 2072 * Take care memory controller reclaiming has small influence 2073 * to global LRU. 2074 */ 2075 if (scanning_global_lru(sc)) { 2076 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 2077 continue; 2078 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 2079 continue; /* Let kswapd poll it */ 2080 /* 2081 * This steals pages from memory cgroups over softlimit 2082 * and returns the number of reclaimed pages and 2083 * scanned pages. This works for global memory pressure 2084 * and balancing, not for a memcg's limit. 2085 */ 2086 nr_soft_scanned = 0; 2087 nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, 2088 sc->order, sc->gfp_mask, 2089 &nr_soft_scanned); 2090 sc->nr_reclaimed += nr_soft_reclaimed; 2091 sc->nr_scanned += nr_soft_scanned; 2092 /* need some check for avoid more shrink_zone() */ 2093 } 2094 2095 shrink_zone(priority, zone, sc); 2096 } 2097} 2098 2099static bool zone_reclaimable(struct zone *zone) 2100{ 2101 return zone->pages_scanned < zone_reclaimable_pages(zone) * 6; 2102} 2103 2104/* All zones in zonelist are unreclaimable? */ 2105static bool all_unreclaimable(struct zonelist *zonelist, 2106 struct scan_control *sc) 2107{ 2108 struct zoneref *z; 2109 struct zone *zone; 2110 2111 for_each_zone_zonelist_nodemask(zone, z, zonelist, 2112 gfp_zone(sc->gfp_mask), sc->nodemask) { 2113 if (!populated_zone(zone)) 2114 continue; 2115 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 2116 continue; 2117 if (!zone->all_unreclaimable) 2118 return false; 2119 } 2120 2121 return true; 2122} 2123 2124/* 2125 * This is the main entry point to direct page reclaim. 2126 * 2127 * If a full scan of the inactive list fails to free enough memory then we 2128 * are "out of memory" and something needs to be killed. 2129 * 2130 * If the caller is !__GFP_FS then the probability of a failure is reasonably 2131 * high - the zone may be full of dirty or under-writeback pages, which this 2132 * caller can't do much about. We kick the writeback threads and take explicit 2133 * naps in the hope that some of these pages can be written. But if the 2134 * allocating task holds filesystem locks which prevent writeout this might not 2135 * work, and the allocation attempt will fail. 2136 * 2137 * returns: 0, if no pages reclaimed 2138 * else, the number of pages reclaimed 2139 */ 2140static unsigned long do_try_to_free_pages(struct zonelist *zonelist, 2141 struct scan_control *sc, 2142 struct shrink_control *shrink) 2143{ 2144 int priority; 2145 unsigned long total_scanned = 0; 2146 struct reclaim_state *reclaim_state = current->reclaim_state; 2147 struct zoneref *z; 2148 struct zone *zone; 2149 unsigned long writeback_threshold; 2150 2151 get_mems_allowed(); 2152 delayacct_freepages_start(); 2153 2154 if (scanning_global_lru(sc)) 2155 count_vm_event(ALLOCSTALL); 2156 2157 for (priority = DEF_PRIORITY; priority >= 0; priority--) { 2158 sc->nr_scanned = 0; 2159 if (!priority) 2160 disable_swap_token(sc->mem_cgroup); 2161 shrink_zones(priority, zonelist, sc); 2162 /* 2163 * Don't shrink slabs when reclaiming memory from 2164 * over limit cgroups 2165 */ 2166 if (scanning_global_lru(sc)) { 2167 unsigned long lru_pages = 0; 2168 for_each_zone_zonelist(zone, z, zonelist, 2169 gfp_zone(sc->gfp_mask)) { 2170 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 2171 continue; 2172 2173 lru_pages += zone_reclaimable_pages(zone); 2174 } 2175 2176 shrink_slab(shrink, sc->nr_scanned, lru_pages); 2177 if (reclaim_state) { 2178 sc->nr_reclaimed += reclaim_state->reclaimed_slab; 2179 reclaim_state->reclaimed_slab = 0; 2180 } 2181 } 2182 total_scanned += sc->nr_scanned; 2183 if (sc->nr_reclaimed >= sc->nr_to_reclaim) 2184 goto out; 2185 2186 /* 2187 * Try to write back as many pages as we just scanned. This 2188 * tends to cause slow streaming writers to write data to the 2189 * disk smoothly, at the dirtying rate, which is nice. But 2190 * that's undesirable in laptop mode, where we *want* lumpy 2191 * writeout. So in laptop mode, write out the whole world. 2192 */ 2193 writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2; 2194 if (total_scanned > writeback_threshold) { 2195 wakeup_flusher_threads(laptop_mode ? 0 : total_scanned); 2196 sc->may_writepage = 1; 2197 } 2198 2199 /* Take a nap, wait for some writeback to complete */ 2200 if (!sc->hibernation_mode && sc->nr_scanned && 2201 priority < DEF_PRIORITY - 2) { 2202 struct zone *preferred_zone; 2203 2204 first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask), 2205 &cpuset_current_mems_allowed, 2206 &preferred_zone); 2207 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10); 2208 } 2209 } 2210 2211out: 2212 delayacct_freepages_end(); 2213 put_mems_allowed(); 2214 2215 if (sc->nr_reclaimed) 2216 return sc->nr_reclaimed; 2217 2218 /* 2219 * As hibernation is going on, kswapd is freezed so that it can't mark 2220 * the zone into all_unreclaimable. Thus bypassing all_unreclaimable 2221 * check. 2222 */ 2223 if (oom_killer_disabled) 2224 return 0; 2225 2226 /* top priority shrink_zones still had more to do? don't OOM, then */ 2227 if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc)) 2228 return 1; 2229 2230 return 0; 2231} 2232 2233unsigned long try_to_free_pages(struct zonelist *zonelist, int order, 2234 gfp_t gfp_mask, nodemask_t *nodemask) 2235{ 2236 unsigned long nr_reclaimed; 2237 struct scan_control sc = { 2238 .gfp_mask = gfp_mask, 2239 .may_writepage = !laptop_mode, 2240 .nr_to_reclaim = SWAP_CLUSTER_MAX, 2241 .may_unmap = 1, 2242 .may_swap = 1, 2243 .order = order, 2244 .mem_cgroup = NULL, 2245 .nodemask = nodemask, 2246 }; 2247 struct shrink_control shrink = { 2248 .gfp_mask = sc.gfp_mask, 2249 }; 2250 2251 trace_mm_vmscan_direct_reclaim_begin(order, 2252 sc.may_writepage, 2253 gfp_mask); 2254 2255 nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink); 2256 2257 trace_mm_vmscan_direct_reclaim_end(nr_reclaimed); 2258 2259 return nr_reclaimed; 2260} 2261 2262#ifdef CONFIG_CGROUP_MEM_RES_CTLR 2263 2264unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, 2265 gfp_t gfp_mask, bool noswap, 2266 struct zone *zone, 2267 unsigned long *nr_scanned) 2268{ 2269 struct scan_control sc = { 2270 .nr_scanned = 0, 2271 .nr_to_reclaim = SWAP_CLUSTER_MAX, 2272 .may_writepage = !laptop_mode, 2273 .may_unmap = 1, 2274 .may_swap = !noswap, 2275 .order = 0, 2276 .mem_cgroup = mem, 2277 }; 2278 2279 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 2280 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); 2281 2282 trace_mm_vmscan_memcg_softlimit_reclaim_begin(0, 2283 sc.may_writepage, 2284 sc.gfp_mask); 2285 2286 /* 2287 * NOTE: Although we can get the priority field, using it 2288 * here is not a good idea, since it limits the pages we can scan. 2289 * if we don't reclaim here, the shrink_zone from balance_pgdat 2290 * will pick up pages from other mem cgroup's as well. We hack 2291 * the priority and make it zero. 2292 */ 2293 shrink_zone(0, zone, &sc); 2294 2295 trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); 2296 2297 *nr_scanned = sc.nr_scanned; 2298 return sc.nr_reclaimed; 2299} 2300 2301unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, 2302 gfp_t gfp_mask, 2303 bool noswap) 2304{ 2305 struct zonelist *zonelist; 2306 unsigned long nr_reclaimed; 2307 int nid; 2308 struct scan_control sc = { 2309 .may_writepage = !laptop_mode, 2310 .may_unmap = 1, 2311 .may_swap = !noswap, 2312 .nr_to_reclaim = SWAP_CLUSTER_MAX, 2313 .order = 0, 2314 .mem_cgroup = mem_cont, 2315 .nodemask = NULL, /* we don't care the placement */ 2316 .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 2317 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), 2318 }; 2319 struct shrink_control shrink = { 2320 .gfp_mask = sc.gfp_mask, 2321 }; 2322 2323 /* 2324 * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't 2325 * take care of from where we get pages. So the node where we start the 2326 * scan does not need to be the current node. 2327 */ 2328 nid = mem_cgroup_select_victim_node(mem_cont); 2329 2330 zonelist = NODE_DATA(nid)->node_zonelists; 2331 2332 trace_mm_vmscan_memcg_reclaim_begin(0, 2333 sc.may_writepage, 2334 sc.gfp_mask); 2335 2336 nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink); 2337 2338 trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); 2339 2340 return nr_reclaimed; 2341} 2342#endif 2343 2344/* 2345 * pgdat_balanced is used when checking if a node is balanced for high-order 2346 * allocations. Only zones that meet watermarks and are in a zone allowed 2347 * by the callers classzone_idx are added to balanced_pages. The total of 2348 * balanced pages must be at least 25% of the zones allowed by classzone_idx 2349 * for the node to be considered balanced. Forcing all zones to be balanced 2350 * for high orders can cause excessive reclaim when there are imbalanced zones. 2351 * The choice of 25% is due to 2352 * o a 16M DMA zone that is balanced will not balance a zone on any 2353 * reasonable sized machine 2354 * o On all other machines, the top zone must be at least a reasonable 2355 * percentage of the middle zones. For example, on 32-bit x86, highmem 2356 * would need to be at least 256M for it to be balance a whole node. 2357 * Similarly, on x86-64 the Normal zone would need to be at least 1G 2358 * to balance a node on its own. These seemed like reasonable ratios. 2359 */ 2360static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages, 2361 int classzone_idx) 2362{ 2363 unsigned long present_pages = 0; 2364 int i; 2365 2366 for (i = 0; i <= classzone_idx; i++) 2367 present_pages += pgdat->node_zones[i].present_pages; 2368 2369 /* A special case here: if zone has no page, we think it's balanced */ 2370 return balanced_pages >= (present_pages >> 2); 2371} 2372 2373/* is kswapd sleeping prematurely? */ 2374static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining, 2375 int classzone_idx) 2376{ 2377 int i; 2378 unsigned long balanced = 0; 2379 bool all_zones_ok = true; 2380 2381 /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ 2382 if (remaining) 2383 return true; 2384 2385 /* Check the watermark levels */ 2386 for (i = 0; i <= classzone_idx; i++) { 2387 struct zone *zone = pgdat->node_zones + i; 2388 2389 if (!populated_zone(zone)) 2390 continue; 2391 2392 /* 2393 * balance_pgdat() skips over all_unreclaimable after 2394 * DEF_PRIORITY. Effectively, it considers them balanced so 2395 * they must be considered balanced here as well if kswapd 2396 * is to sleep 2397 */ 2398 if (zone->all_unreclaimable) { 2399 balanced += zone->present_pages; 2400 continue; 2401 } 2402 2403 if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), 2404 i, 0)) 2405 all_zones_ok = false; 2406 else 2407 balanced += zone->present_pages; 2408 } 2409 2410 /* 2411 * For high-order requests, the balanced zones must contain at least 2412 * 25% of the nodes pages for kswapd to sleep. For order-0, all zones 2413 * must be balanced 2414 */ 2415 if (order) 2416 return !pgdat_balanced(pgdat, balanced, classzone_idx); 2417 else 2418 return !all_zones_ok; 2419} 2420 2421/* 2422 * For kswapd, balance_pgdat() will work across all this node's zones until 2423 * they are all at high_wmark_pages(zone). 2424 * 2425 * Returns the final order kswapd was reclaiming at 2426 * 2427 * There is special handling here for zones which are full of pinned pages. 2428 * This can happen if the pages are all mlocked, or if they are all used by 2429 * device drivers (say, ZONE_DMA). Or if they are all in use by hugetlb. 2430 * What we do is to detect the case where all pages in the zone have been 2431 * scanned twice and there has been zero successful reclaim. Mark the zone as 2432 * dead and from now on, only perform a short scan. Basically we're polling 2433 * the zone for when the problem goes away. 2434 * 2435 * kswapd scans the zones in the highmem->normal->dma direction. It skips 2436 * zones which have free_pages > high_wmark_pages(zone), but once a zone is 2437 * found to have free_pages <= high_wmark_pages(zone), we scan that zone and the 2438 * lower zones regardless of the number of free pages in the lower zones. This 2439 * interoperates with the page allocator fallback scheme to ensure that aging 2440 * of pages is balanced across the zones. 2441 */ 2442static unsigned long balance_pgdat(pg_data_t *pgdat, int order, 2443 int *classzone_idx) 2444{ 2445 int all_zones_ok; 2446 unsigned long balanced; 2447 int priority; 2448 int i; 2449 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ 2450 unsigned long total_scanned; 2451 struct reclaim_state *reclaim_state = current->reclaim_state; 2452 unsigned long nr_soft_reclaimed; 2453 unsigned long nr_soft_scanned; 2454 struct scan_control sc = { 2455 .gfp_mask = GFP_KERNEL, 2456 .may_unmap = 1, 2457 .may_swap = 1, 2458 /* 2459 * kswapd doesn't want to be bailed out while reclaim. because 2460 * we want to put equal scanning pressure on each zone. 2461 */ 2462 .nr_to_reclaim = ULONG_MAX, 2463 .order = order, 2464 .mem_cgroup = NULL, 2465 }; 2466 struct shrink_control shrink = { 2467 .gfp_mask = sc.gfp_mask, 2468 }; 2469loop_again: 2470 total_scanned = 0; 2471 sc.nr_reclaimed = 0; 2472 sc.may_writepage = !laptop_mode; 2473 count_vm_event(PAGEOUTRUN); 2474 2475 for (priority = DEF_PRIORITY; priority >= 0; priority--) { 2476 unsigned long lru_pages = 0; 2477 int has_under_min_watermark_zone = 0; 2478 2479 /* The swap token gets in the way of swapout... */ 2480 if (!priority) 2481 disable_swap_token(NULL); 2482 2483 all_zones_ok = 1; 2484 balanced = 0; 2485 2486 /* 2487 * Scan in the highmem->dma direction for the highest 2488 * zone which needs scanning 2489 */ 2490 for (i = pgdat->nr_zones - 1; i >= 0; i--) { 2491 struct zone *zone = pgdat->node_zones + i; 2492 2493 if (!populated_zone(zone)) 2494 continue; 2495 2496 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 2497 continue; 2498 2499 /* 2500 * Do some background aging of the anon list, to give 2501 * pages a chance to be referenced before reclaiming. 2502 */ 2503 if (inactive_anon_is_low(zone, &sc)) 2504 shrink_active_list(SWAP_CLUSTER_MAX, zone, 2505 &sc, priority, 0); 2506 2507 if (!zone_watermark_ok_safe(zone, order, 2508 high_wmark_pages(zone), 0, 0)) { 2509 end_zone = i; 2510 break; 2511 } else { 2512 /* If balanced, clear the congested flag */ 2513 zone_clear_flag(zone, ZONE_CONGESTED); 2514 } 2515 } 2516 if (i < 0) 2517 goto out; 2518 2519 for (i = 0; i <= end_zone; i++) { 2520 struct zone *zone = pgdat->node_zones + i; 2521 2522 lru_pages += zone_reclaimable_pages(zone); 2523 } 2524 2525 /* 2526 * Now scan the zone in the dma->highmem direction, stopping 2527 * at the last zone which needs scanning. 2528 * 2529 * We do this because the page allocator works in the opposite 2530 * direction. This prevents the page allocator from allocating 2531 * pages behind kswapd's direction of progress, which would 2532 * cause too much scanning of the lower zones. 2533 */ 2534 for (i = 0; i <= end_zone; i++) { 2535 struct zone *zone = pgdat->node_zones + i; 2536 int nr_slab; 2537 unsigned long balance_gap; 2538 2539 if (!populated_zone(zone)) 2540 continue; 2541 2542 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 2543 continue; 2544 2545 sc.nr_scanned = 0; 2546 2547 nr_soft_scanned = 0; 2548 /* 2549 * Call soft limit reclaim before calling shrink_zone. 2550 */ 2551 nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, 2552 order, sc.gfp_mask, 2553 &nr_soft_scanned); 2554 sc.nr_reclaimed += nr_soft_reclaimed; 2555 total_scanned += nr_soft_scanned; 2556 2557 /* 2558 * We put equal pressure on every zone, unless 2559 * one zone has way too many pages free 2560 * already. The "too many pages" is defined 2561 * as the high wmark plus a "gap" where the 2562 * gap is either the low watermark or 1% 2563 * of the zone, whichever is smaller. 2564 */ 2565 balance_gap = min(low_wmark_pages(zone), 2566 (zone->present_pages + 2567 KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / 2568 KSWAPD_ZONE_BALANCE_GAP_RATIO); 2569 if (!zone_watermark_ok_safe(zone, order, 2570 high_wmark_pages(zone) + balance_gap, 2571 end_zone, 0)) { 2572 shrink_zone(priority, zone, &sc); 2573 2574 reclaim_state->reclaimed_slab = 0; 2575 nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages); 2576 sc.nr_reclaimed += reclaim_state->reclaimed_slab; 2577 total_scanned += sc.nr_scanned; 2578 2579 if (nr_slab == 0 && !zone_reclaimable(zone)) 2580 zone->all_unreclaimable = 1; 2581 } 2582 2583 /* 2584 * If we've done a decent amount of scanning and 2585 * the reclaim ratio is low, start doing writepage 2586 * even in laptop mode 2587 */ 2588 if (total_scanned > SWAP_CLUSTER_MAX * 2 && 2589 total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) 2590 sc.may_writepage = 1; 2591 2592 if (zone->all_unreclaimable) { 2593 if (end_zone && end_zone == i) 2594 end_zone--; 2595 continue; 2596 } 2597 2598 if (!zone_watermark_ok_safe(zone, order, 2599 high_wmark_pages(zone), end_zone, 0)) { 2600 all_zones_ok = 0; 2601 /* 2602 * We are still under min water mark. This 2603 * means that we have a GFP_ATOMIC allocation 2604 * failure risk. Hurry up! 2605 */ 2606 if (!zone_watermark_ok_safe(zone, order, 2607 min_wmark_pages(zone), end_zone, 0)) 2608 has_under_min_watermark_zone = 1; 2609 } else { 2610 /* 2611 * If a zone reaches its high watermark, 2612 * consider it to be no longer congested. It's 2613 * possible there are dirty pages backed by 2614 * congested BDIs but as pressure is relieved, 2615 * spectulatively avoid congestion waits 2616 */ 2617 zone_clear_flag(zone, ZONE_CONGESTED); 2618 if (i <= *classzone_idx) 2619 balanced += zone->present_pages; 2620 } 2621 2622 } 2623 if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx))) 2624 break; /* kswapd: all done */ 2625 /* 2626 * OK, kswapd is getting into trouble. Take a nap, then take 2627 * another pass across the zones. 2628 */ 2629 if (total_scanned && (priority < DEF_PRIORITY - 2)) { 2630 if (has_under_min_watermark_zone) 2631 count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT); 2632 else 2633 congestion_wait(BLK_RW_ASYNC, HZ/10); 2634 } 2635 2636 /* 2637 * We do this so kswapd doesn't build up large priorities for 2638 * example when it is freeing in parallel with allocators. It 2639 * matches the direct reclaim path behaviour in terms of impact 2640 * on zone->*_priority. 2641 */ 2642 if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX) 2643 break; 2644 } 2645out: 2646 2647 /* 2648 * order-0: All zones must meet high watermark for a balanced node 2649 * high-order: Balanced zones must make up at least 25% of the node 2650 * for the node to be balanced 2651 */ 2652 if (!(all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))) { 2653 cond_resched(); 2654 2655 try_to_freeze(); 2656 2657 /* 2658 * Fragmentation may mean that the system cannot be 2659 * rebalanced for high-order allocations in all zones. 2660 * At this point, if nr_reclaimed < SWAP_CLUSTER_MAX, 2661 * it means the zones have been fully scanned and are still 2662 * not balanced. For high-order allocations, there is 2663 * little point trying all over again as kswapd may 2664 * infinite loop. 2665 * 2666 * Instead, recheck all watermarks at order-0 as they 2667 * are the most important. If watermarks are ok, kswapd will go 2668 * back to sleep. High-order users can still perform direct 2669 * reclaim if they wish. 2670 */ 2671 if (sc.nr_reclaimed < SWAP_CLUSTER_MAX) 2672 order = sc.order = 0; 2673 2674 goto loop_again; 2675 } 2676 2677 /* 2678 * If kswapd was reclaiming at a higher order, it has the option of 2679 * sleeping without all zones being balanced. Before it does, it must 2680 * ensure that the watermarks for order-0 on *all* zones are met and 2681 * that the congestion flags are cleared. The congestion flag must 2682 * be cleared as kswapd is the only mechanism that clears the flag 2683 * and it is potentially going to sleep here. 2684 */ 2685 if (order) { 2686 for (i = 0; i <= end_zone; i++) { 2687 struct zone *zone = pgdat->node_zones + i; 2688 2689 if (!populated_zone(zone)) 2690 continue; 2691 2692 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 2693 continue; 2694 2695 /* Confirm the zone is balanced for order-0 */ 2696 if (!zone_watermark_ok(zone, 0, 2697 high_wmark_pages(zone), 0, 0)) { 2698 order = sc.order = 0; 2699 goto loop_again; 2700 } 2701 2702 /* If balanced, clear the congested flag */ 2703 zone_clear_flag(zone, ZONE_CONGESTED); 2704 } 2705 } 2706 2707 /* 2708 * Return the order we were reclaiming at so sleeping_prematurely() 2709 * makes a decision on the order we were last reclaiming at. However, 2710 * if another caller entered the allocator slow path while kswapd 2711 * was awake, order will remain at the higher level 2712 */ 2713 *classzone_idx = end_zone; 2714 return order; 2715} 2716 2717static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) 2718{ 2719 long remaining = 0; 2720 DEFINE_WAIT(wait); 2721 2722 if (freezing(current) || kthread_should_stop()) 2723 return; 2724 2725 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); 2726 2727 /* Try to sleep for a short interval */ 2728 if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) { 2729 remaining = schedule_timeout(HZ/10); 2730 finish_wait(&pgdat->kswapd_wait, &wait); 2731 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); 2732 } 2733 2734 /* 2735 * After a short sleep, check if it was a premature sleep. If not, then 2736 * go fully to sleep until explicitly woken up. 2737 */ 2738 if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) { 2739 trace_mm_vmscan_kswapd_sleep(pgdat->node_id); 2740 2741 /* 2742 * vmstat counters are not perfectly accurate and the estimated 2743 * value for counters such as NR_FREE_PAGES can deviate from the 2744 * true value by nr_online_cpus * threshold. To avoid the zone 2745 * watermarks being breached while under pressure, we reduce the 2746 * per-cpu vmstat threshold while kswapd is awake and restore 2747 * them before going back to sleep. 2748 */ 2749 set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); 2750 schedule(); 2751 set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold); 2752 } else { 2753 if (remaining) 2754 count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY); 2755 else 2756 count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY); 2757 } 2758 finish_wait(&pgdat->kswapd_wait, &wait); 2759} 2760 2761/* 2762 * The background pageout daemon, started as a kernel thread 2763 * from the init process. 2764 * 2765 * This basically trickles out pages so that we have _some_ 2766 * free memory available even if there is no other activity 2767 * that frees anything up. This is needed for things like routing 2768 * etc, where we otherwise might have all activity going on in 2769 * asynchronous contexts that cannot page things out. 2770 * 2771 * If there are applications that are active memory-allocators 2772 * (most normal use), this basically shouldn't matter. 2773 */ 2774static int kswapd(void *p) 2775{ 2776 unsigned long order, new_order; 2777 int classzone_idx, new_classzone_idx; 2778 pg_data_t *pgdat = (pg_data_t*)p; 2779 struct task_struct *tsk = current; 2780 2781 struct reclaim_state reclaim_state = { 2782 .reclaimed_slab = 0, 2783 }; 2784 const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); 2785 2786 lockdep_set_current_reclaim_state(GFP_KERNEL); 2787 2788 if (!cpumask_empty(cpumask)) 2789 set_cpus_allowed_ptr(tsk, cpumask); 2790 current->reclaim_state = &reclaim_state; 2791 2792 /* 2793 * Tell the memory management that we're a "memory allocator", 2794 * and that if we need more memory we should get access to it 2795 * regardless (see "__alloc_pages()"). "kswapd" should 2796 * never get caught in the normal page freeing logic. 2797 * 2798 * (Kswapd normally doesn't need memory anyway, but sometimes 2799 * you need a small amount of memory in order to be able to 2800 * page out something else, and this flag essentially protects 2801 * us from recursively trying to free more memory as we're 2802 * trying to free the first piece of memory in the first place). 2803 */ 2804 tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; 2805 set_freezable(); 2806 2807 order = new_order = 0; 2808 classzone_idx = new_classzone_idx = pgdat->nr_zones - 1; 2809 for ( ; ; ) { 2810 int ret; 2811 2812 /* 2813 * If the last balance_pgdat was unsuccessful it's unlikely a 2814 * new request of a similar or harder type will succeed soon 2815 * so consider going to sleep on the basis we reclaimed at 2816 */ 2817 if (classzone_idx >= new_classzone_idx && order == new_order) { 2818 new_order = pgdat->kswapd_max_order; 2819 new_classzone_idx = pgdat->classzone_idx; 2820 pgdat->kswapd_max_order = 0; 2821 pgdat->classzone_idx = pgdat->nr_zones - 1; 2822 } 2823 2824 if (order < new_order || classzone_idx > new_classzone_idx) { 2825 /* 2826 * Don't sleep if someone wants a larger 'order' 2827 * allocation or has tigher zone constraints 2828 */ 2829 order = new_order; 2830 classzone_idx = new_classzone_idx; 2831 } else { 2832 kswapd_try_to_sleep(pgdat, order, classzone_idx); 2833 order = pgdat->kswapd_max_order; 2834 classzone_idx = pgdat->classzone_idx; 2835 pgdat->kswapd_max_order = 0; 2836 pgdat->classzone_idx = pgdat->nr_zones - 1; 2837 } 2838 2839 ret = try_to_freeze(); 2840 if (kthread_should_stop()) 2841 break; 2842 2843 /* 2844 * We can speed up thawing tasks if we don't call balance_pgdat 2845 * after returning from the refrigerator 2846 */ 2847 if (!ret) { 2848 trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); 2849 order = balance_pgdat(pgdat, order, &classzone_idx); 2850 } 2851 } 2852 return 0; 2853} 2854 2855/* 2856 * A zone is low on free memory, so wake its kswapd task to service it. 2857 */ 2858void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) 2859{ 2860 pg_data_t *pgdat; 2861 2862 if (!populated_zone(zone)) 2863 return; 2864 2865 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 2866 return; 2867 pgdat = zone->zone_pgdat; 2868 if (pgdat->kswapd_max_order < order) { 2869 pgdat->kswapd_max_order = order; 2870 pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx); 2871 } 2872 if (!waitqueue_active(&pgdat->kswapd_wait)) 2873 return; 2874 if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0)) 2875 return; 2876 2877 trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); 2878 wake_up_interruptible(&pgdat->kswapd_wait); 2879} 2880 2881/* 2882 * The reclaimable count would be mostly accurate. 2883 * The less reclaimable pages may be 2884 * - mlocked pages, which will be moved to unevictable list when encountered 2885 * - mapped pages, which may require several travels to be reclaimed 2886 * - dirty pages, which is not "instantly" reclaimable 2887 */ 2888unsigned long global_reclaimable_pages(void) 2889{ 2890 int nr; 2891 2892 nr = global_page_state(NR_ACTIVE_FILE) + 2893 global_page_state(NR_INACTIVE_FILE); 2894 2895 if (nr_swap_pages > 0) 2896 nr += global_page_state(NR_ACTIVE_ANON) + 2897 global_page_state(NR_INACTIVE_ANON); 2898 2899 return nr; 2900} 2901 2902unsigned long zone_reclaimable_pages(struct zone *zone) 2903{ 2904 int nr; 2905 2906 nr = zone_page_state(zone, NR_ACTIVE_FILE) + 2907 zone_page_state(zone, NR_INACTIVE_FILE); 2908 2909 if (nr_swap_pages > 0) 2910 nr += zone_page_state(zone, NR_ACTIVE_ANON) + 2911 zone_page_state(zone, NR_INACTIVE_ANON); 2912 2913 return nr; 2914} 2915 2916#ifdef CONFIG_HIBERNATION 2917/* 2918 * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of 2919 * freed pages. 2920 * 2921 * Rather than trying to age LRUs the aim is to preserve the overall 2922 * LRU order by reclaiming preferentially 2923 * inactive > active > active referenced > active mapped 2924 */ 2925unsigned long shrink_all_memory(unsigned long nr_to_reclaim) 2926{ 2927 struct reclaim_state reclaim_state; 2928 struct scan_control sc = { 2929 .gfp_mask = GFP_HIGHUSER_MOVABLE, 2930 .may_swap = 1, 2931 .may_unmap = 1, 2932 .may_writepage = 1, 2933 .nr_to_reclaim = nr_to_reclaim, 2934 .hibernation_mode = 1, 2935 .order = 0, 2936 }; 2937 struct shrink_control shrink = { 2938 .gfp_mask = sc.gfp_mask, 2939 }; 2940 struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); 2941 struct task_struct *p = current; 2942 unsigned long nr_reclaimed; 2943 2944 p->flags |= PF_MEMALLOC; 2945 lockdep_set_current_reclaim_state(sc.gfp_mask); 2946 reclaim_state.reclaimed_slab = 0; 2947 p->reclaim_state = &reclaim_state; 2948 2949 nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink); 2950 2951 p->reclaim_state = NULL; 2952 lockdep_clear_current_reclaim_state(); 2953 p->flags &= ~PF_MEMALLOC; 2954 2955 return nr_reclaimed; 2956} 2957#endif /* CONFIG_HIBERNATION */ 2958 2959/* It's optimal to keep kswapds on the same CPUs as their memory, but 2960 not required for correctness. So if the last cpu in a node goes 2961 away, we get changed to run anywhere: as the first one comes back, 2962 restore their cpu bindings. */ 2963static int __devinit cpu_callback(struct notifier_block *nfb, 2964 unsigned long action, void *hcpu) 2965{ 2966 int nid; 2967 2968 if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) { 2969 for_each_node_state(nid, N_HIGH_MEMORY) { 2970 pg_data_t *pgdat = NODE_DATA(nid); 2971 const struct cpumask *mask; 2972 2973 mask = cpumask_of_node(pgdat->node_id); 2974 2975 if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids) 2976 /* One of our CPUs online: restore mask */ 2977 set_cpus_allowed_ptr(pgdat->kswapd, mask); 2978 } 2979 } 2980 return NOTIFY_OK; 2981} 2982 2983/* 2984 * This kswapd start function will be called by init and node-hot-add. 2985 * On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added. 2986 */ 2987int kswapd_run(int nid) 2988{ 2989 pg_data_t *pgdat = NODE_DATA(nid); 2990 int ret = 0; 2991 2992 if (pgdat->kswapd) 2993 return 0; 2994 2995 pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid); 2996 if (IS_ERR(pgdat->kswapd)) { 2997 /* failure at boot is fatal */ 2998 BUG_ON(system_state == SYSTEM_BOOTING); 2999 printk("Failed to start kswapd on node %d\n",nid); 3000 ret = -1; 3001 } 3002 return ret; 3003} 3004 3005/* 3006 * Called by memory hotplug when all memory in a node is offlined. 3007 */ 3008void kswapd_stop(int nid) 3009{ 3010 struct task_struct *kswapd = NODE_DATA(nid)->kswapd; 3011 3012 if (kswapd) 3013 kthread_stop(kswapd); 3014} 3015 3016static int __init kswapd_init(void) 3017{ 3018 int nid; 3019 3020 swap_setup(); 3021 for_each_node_state(nid, N_HIGH_MEMORY) 3022 kswapd_run(nid); 3023 hotcpu_notifier(cpu_callback, 0); 3024 return 0; 3025} 3026 3027module_init(kswapd_init) 3028 3029#ifdef CONFIG_NUMA 3030/* 3031 * Zone reclaim mode 3032 * 3033 * If non-zero call zone_reclaim when the number of free pages falls below 3034 * the watermarks. 3035 */ 3036int zone_reclaim_mode __read_mostly; 3037 3038#define RECLAIM_OFF 0 3039#define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */ 3040#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ 3041#define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */ 3042 3043/* 3044 * Priority for ZONE_RECLAIM. This determines the fraction of pages 3045 * of a node considered for each zone_reclaim. 4 scans 1/16th of 3046 * a zone. 3047 */ 3048#define ZONE_RECLAIM_PRIORITY 4 3049 3050/* 3051 * Percentage of pages in a zone that must be unmapped for zone_reclaim to 3052 * occur. 3053 */ 3054int sysctl_min_unmapped_ratio = 1; 3055 3056/* 3057 * If the number of slab pages in a zone grows beyond this percentage then 3058 * slab reclaim needs to occur. 3059 */ 3060int sysctl_min_slab_ratio = 5; 3061 3062static inline unsigned long zone_unmapped_file_pages(struct zone *zone) 3063{ 3064 unsigned long file_mapped = zone_page_state(zone, NR_FILE_MAPPED); 3065 unsigned long file_lru = zone_page_state(zone, NR_INACTIVE_FILE) + 3066 zone_page_state(zone, NR_ACTIVE_FILE); 3067 3068 /* 3069 * It's possible for there to be more file mapped pages than 3070 * accounted for by the pages on the file LRU lists because 3071 * tmpfs pages accounted for as ANON can also be FILE_MAPPED 3072 */ 3073 return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0; 3074} 3075 3076/* Work out how many page cache pages we can reclaim in this reclaim_mode */ 3077static long zone_pagecache_reclaimable(struct zone *zone) 3078{ 3079 long nr_pagecache_reclaimable; 3080 long delta = 0; 3081 3082 /* 3083 * If RECLAIM_SWAP is set, then all file pages are considered 3084 * potentially reclaimable. Otherwise, we have to worry about 3085 * pages like swapcache and zone_unmapped_file_pages() provides 3086 * a better estimate 3087 */ 3088 if (zone_reclaim_mode & RECLAIM_SWAP) 3089 nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES); 3090 else 3091 nr_pagecache_reclaimable = zone_unmapped_file_pages(zone); 3092 3093 /* If we can't clean pages, remove dirty pages from consideration */ 3094 if (!(zone_reclaim_mode & RECLAIM_WRITE)) 3095 delta += zone_page_state(zone, NR_FILE_DIRTY); 3096 3097 /* Watch for any possible underflows due to delta */ 3098 if (unlikely(delta > nr_pagecache_reclaimable)) 3099 delta = nr_pagecache_reclaimable; 3100 3101 return nr_pagecache_reclaimable - delta; 3102} 3103 3104/* 3105 * Try to free up some pages from this zone through reclaim. 3106 */ 3107static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) 3108{ 3109 /* Minimum pages needed in order to stay on node */ 3110 const unsigned long nr_pages = 1 << order; 3111 struct task_struct *p = current; 3112 struct reclaim_state reclaim_state; 3113 int priority; 3114 struct scan_control sc = { 3115 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), 3116 .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), 3117 .may_swap = 1, 3118 .nr_to_reclaim = max_t(unsigned long, nr_pages, 3119 SWAP_CLUSTER_MAX), 3120 .gfp_mask = gfp_mask, 3121 .order = order, 3122 }; 3123 struct shrink_control shrink = { 3124 .gfp_mask = sc.gfp_mask, 3125 }; 3126 unsigned long nr_slab_pages0, nr_slab_pages1; 3127 3128 cond_resched(); 3129 /* 3130 * We need to be able to allocate from the reserves for RECLAIM_SWAP 3131 * and we also need to be able to write out pages for RECLAIM_WRITE 3132 * and RECLAIM_SWAP. 3133 */ 3134 p->flags |= PF_MEMALLOC | PF_SWAPWRITE; 3135 lockdep_set_current_reclaim_state(gfp_mask); 3136 reclaim_state.reclaimed_slab = 0; 3137 p->reclaim_state = &reclaim_state; 3138 3139 if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) { 3140 /* 3141 * Free memory by calling shrink zone with increasing 3142 * priorities until we have enough memory freed. 3143 */ 3144 priority = ZONE_RECLAIM_PRIORITY; 3145 do { 3146 shrink_zone(priority, zone, &sc); 3147 priority--; 3148 } while (priority >= 0 && sc.nr_reclaimed < nr_pages); 3149 } 3150 3151 nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE); 3152 if (nr_slab_pages0 > zone->min_slab_pages) { 3153 /* 3154 * shrink_slab() does not currently allow us to determine how 3155 * many pages were freed in this zone. So we take the current 3156 * number of slab pages and shake the slab until it is reduced 3157 * by the same nr_pages that we used for reclaiming unmapped 3158 * pages. 3159 * 3160 * Note that shrink_slab will free memory on all zones and may 3161 * take a long time. 3162 */ 3163 for (;;) { 3164 unsigned long lru_pages = zone_reclaimable_pages(zone); 3165 3166 /* No reclaimable slab or very low memory pressure */ 3167 if (!shrink_slab(&shrink, sc.nr_scanned, lru_pages)) 3168 break; 3169 3170 /* Freed enough memory */ 3171 nr_slab_pages1 = zone_page_state(zone, 3172 NR_SLAB_RECLAIMABLE); 3173 if (nr_slab_pages1 + nr_pages <= nr_slab_pages0) 3174 break; 3175 } 3176 3177 /* 3178 * Update nr_reclaimed by the number of slab pages we 3179 * reclaimed from this zone. 3180 */ 3181 nr_slab_pages1 = zone_page_state(zone, NR_SLAB_RECLAIMABLE); 3182 if (nr_slab_pages1 < nr_slab_pages0) 3183 sc.nr_reclaimed += nr_slab_pages0 - nr_slab_pages1; 3184 } 3185 3186 p->reclaim_state = NULL; 3187 current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); 3188 lockdep_clear_current_reclaim_state(); 3189 return sc.nr_reclaimed >= nr_pages; 3190} 3191 3192int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) 3193{ 3194 int node_id; 3195 int ret; 3196 3197 /* 3198 * Zone reclaim reclaims unmapped file backed pages and 3199 * slab pages if we are over the defined limits. 3200 * 3201 * A small portion of unmapped file backed pages is needed for 3202 * file I/O otherwise pages read by file I/O will be immediately 3203 * thrown out if the zone is overallocated. So we do not reclaim 3204 * if less than a specified percentage of the zone is used by 3205 * unmapped file backed pages. 3206 */ 3207 if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages && 3208 zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages) 3209 return ZONE_RECLAIM_FULL; 3210 3211 if (zone->all_unreclaimable) 3212 return ZONE_RECLAIM_FULL; 3213 3214 /* 3215 * Do not scan if the allocation should not be delayed. 3216 */ 3217 if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC)) 3218 return ZONE_RECLAIM_NOSCAN; 3219 3220 /* 3221 * Only run zone reclaim on the local zone or on zones that do not 3222 * have associated processors. This will favor the local processor 3223 * over remote processors and spread off node memory allocations 3224 * as wide as possible. 3225 */ 3226 node_id = zone_to_nid(zone); 3227 if (node_state(node_id, N_CPU) && node_id != numa_node_id()) 3228 return ZONE_RECLAIM_NOSCAN; 3229 3230 if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED)) 3231 return ZONE_RECLAIM_NOSCAN; 3232 3233 ret = __zone_reclaim(zone, gfp_mask, order); 3234 zone_clear_flag(zone, ZONE_RECLAIM_LOCKED); 3235 3236 if (!ret) 3237 count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED); 3238 3239 return ret; 3240} 3241#endif 3242 3243/* 3244 * page_evictable - test whether a page is evictable 3245 * @page: the page to test 3246 * @vma: the VMA in which the page is or will be mapped, may be NULL 3247 * 3248 * Test whether page is evictable--i.e., should be placed on active/inactive 3249 * lists vs unevictable list. The vma argument is !NULL when called from the 3250 * fault path to determine how to instantate a new page. 3251 * 3252 * Reasons page might not be evictable: 3253 * (1) page's mapping marked unevictable 3254 * (2) page is part of an mlocked VMA 3255 * 3256 */ 3257int page_evictable(struct page *page, struct vm_area_struct *vma) 3258{ 3259 3260 if (mapping_unevictable(page_mapping(page))) 3261 return 0; 3262 3263 if (PageMlocked(page) || (vma && is_mlocked_vma(vma, page))) 3264 return 0; 3265 3266 return 1; 3267} 3268 3269/** 3270 * check_move_unevictable_page - check page for evictability and move to appropriate zone lru list 3271 * @page: page to check evictability and move to appropriate lru list 3272 * @zone: zone page is in 3273 * 3274 * Checks a page for evictability and moves the page to the appropriate 3275 * zone lru list. 3276 * 3277 * Restrictions: zone->lru_lock must be held, page must be on LRU and must 3278 * have PageUnevictable set. 3279 */ 3280static void check_move_unevictable_page(struct page *page, struct zone *zone) 3281{ 3282 VM_BUG_ON(PageActive(page)); 3283 3284retry: 3285 ClearPageUnevictable(page); 3286 if (page_evictable(page, NULL)) { 3287 enum lru_list l = page_lru_base_type(page); 3288 3289 __dec_zone_state(zone, NR_UNEVICTABLE); 3290 list_move(&page->lru, &zone->lru[l].list); 3291 mem_cgroup_move_lists(page, LRU_UNEVICTABLE, l); 3292 __inc_zone_state(zone, NR_INACTIVE_ANON + l); 3293 __count_vm_event(UNEVICTABLE_PGRESCUED); 3294 } else { 3295 /* 3296 * rotate unevictable list 3297 */ 3298 SetPageUnevictable(page); 3299 list_move(&page->lru, &zone->lru[LRU_UNEVICTABLE].list); 3300 mem_cgroup_rotate_lru_list(page, LRU_UNEVICTABLE); 3301 if (page_evictable(page, NULL)) 3302 goto retry; 3303 } 3304} 3305 3306/** 3307 * scan_mapping_unevictable_pages - scan an address space for evictable pages 3308 * @mapping: struct address_space to scan for evictable pages 3309 * 3310 * Scan all pages in mapping. Check unevictable pages for 3311 * evictability and move them to the appropriate zone lru list. 3312 */ 3313void scan_mapping_unevictable_pages(struct address_space *mapping) 3314{ 3315 pgoff_t next = 0; 3316 pgoff_t end = (i_size_read(mapping->host) + PAGE_CACHE_SIZE - 1) >> 3317 PAGE_CACHE_SHIFT; 3318 struct zone *zone; 3319 struct pagevec pvec; 3320 3321 if (mapping->nrpages == 0) 3322 return; 3323 3324 pagevec_init(&pvec, 0); 3325 while (next < end && 3326 pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { 3327 int i; 3328 int pg_scanned = 0; 3329 3330 zone = NULL; 3331 3332 for (i = 0; i < pagevec_count(&pvec); i++) { 3333 struct page *page = pvec.pages[i]; 3334 pgoff_t page_index = page->index; 3335 struct zone *pagezone = page_zone(page); 3336 3337 pg_scanned++; 3338 if (page_index > next) 3339 next = page_index; 3340 next++; 3341 3342 if (pagezone != zone) { 3343 if (zone) 3344 spin_unlock_irq(&zone->lru_lock); 3345 zone = pagezone; 3346 spin_lock_irq(&zone->lru_lock); 3347 } 3348 3349 if (PageLRU(page) && PageUnevictable(page)) 3350 check_move_unevictable_page(page, zone); 3351 } 3352 if (zone) 3353 spin_unlock_irq(&zone->lru_lock); 3354 pagevec_release(&pvec); 3355 3356 count_vm_events(UNEVICTABLE_PGSCANNED, pg_scanned); 3357 } 3358 3359} 3360 3361/** 3362 * scan_zone_unevictable_pages - check unevictable list for evictable pages 3363 * @zone - zone of which to scan the unevictable list 3364 * 3365 * Scan @zone's unevictable LRU lists to check for pages that have become 3366 * evictable. Move those that have to @zone's inactive list where they 3367 * become candidates for reclaim, unless shrink_inactive_zone() decides 3368 * to reactivate them. Pages that are still unevictable are rotated 3369 * back onto @zone's unevictable list. 3370 */ 3371#define SCAN_UNEVICTABLE_BATCH_SIZE 16UL /* arbitrary lock hold batch size */ 3372static void scan_zone_unevictable_pages(struct zone *zone) 3373{ 3374 struct list_head *l_unevictable = &zone->lru[LRU_UNEVICTABLE].list; 3375 unsigned long scan; 3376 unsigned long nr_to_scan = zone_page_state(zone, NR_UNEVICTABLE); 3377 3378 while (nr_to_scan > 0) { 3379 unsigned long batch_size = min(nr_to_scan, 3380 SCAN_UNEVICTABLE_BATCH_SIZE); 3381 3382 spin_lock_irq(&zone->lru_lock); 3383 for (scan = 0; scan < batch_size; scan++) { 3384 struct page *page = lru_to_page(l_unevictable); 3385 3386 if (!trylock_page(page)) 3387 continue; 3388 3389 prefetchw_prev_lru_page(page, l_unevictable, flags); 3390 3391 if (likely(PageLRU(page) && PageUnevictable(page))) 3392 check_move_unevictable_page(page, zone); 3393 3394 unlock_page(page); 3395 } 3396 spin_unlock_irq(&zone->lru_lock); 3397 3398 nr_to_scan -= batch_size; 3399 } 3400} 3401 3402 3403/** 3404 * scan_all_zones_unevictable_pages - scan all unevictable lists for evictable pages 3405 * 3406 * A really big hammer: scan all zones' unevictable LRU lists to check for 3407 * pages that have become evictable. Move those back to the zones' 3408 * inactive list where they become candidates for reclaim. 3409 * This occurs when, e.g., we have unswappable pages on the unevictable lists, 3410 * and we add swap to the system. As such, it runs in the context of a task 3411 * that has possibly/probably made some previously unevictable pages 3412 * evictable. 3413 */ 3414static void scan_all_zones_unevictable_pages(void) 3415{ 3416 struct zone *zone; 3417 3418 for_each_zone(zone) { 3419 scan_zone_unevictable_pages(zone); 3420 } 3421} 3422 3423/* 3424 * scan_unevictable_pages [vm] sysctl handler. On demand re-scan of 3425 * all nodes' unevictable lists for evictable pages 3426 */ 3427unsigned long scan_unevictable_pages; 3428 3429int scan_unevictable_handler(struct ctl_table *table, int write, 3430 void __user *buffer, 3431 size_t *length, loff_t *ppos) 3432{ 3433 proc_doulongvec_minmax(table, write, buffer, length, ppos); 3434 3435 if (write && *(unsigned long *)table->data) 3436 scan_all_zones_unevictable_pages(); 3437 3438 scan_unevictable_pages = 0; 3439 return 0; 3440} 3441 3442#ifdef CONFIG_NUMA 3443/* 3444 * per node 'scan_unevictable_pages' attribute. On demand re-scan of 3445 * a specified node's per zone unevictable lists for evictable pages. 3446 */ 3447 3448static ssize_t read_scan_unevictable_node(struct sys_device *dev, 3449 struct sysdev_attribute *attr, 3450 char *buf) 3451{ 3452 return sprintf(buf, "0\n"); /* always zero; should fit... */ 3453} 3454 3455static ssize_t write_scan_unevictable_node(struct sys_device *dev, 3456 struct sysdev_attribute *attr, 3457 const char *buf, size_t count) 3458{ 3459 struct zone *node_zones = NODE_DATA(dev->id)->node_zones; 3460 struct zone *zone; 3461 unsigned long res; 3462 unsigned long req = strict_strtoul(buf, 10, &res); 3463 3464 if (!req) 3465 return 1; /* zero is no-op */ 3466 3467 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { 3468 if (!populated_zone(zone)) 3469 continue; 3470 scan_zone_unevictable_pages(zone); 3471 } 3472 return 1; 3473} 3474 3475 3476static SYSDEV_ATTR(scan_unevictable_pages, S_IRUGO | S_IWUSR, 3477 read_scan_unevictable_node, 3478 write_scan_unevictable_node); 3479 3480int scan_unevictable_register_node(struct node *node) 3481{ 3482 return sysdev_create_file(&node->sysdev, &attr_scan_unevictable_pages); 3483} 3484 3485void scan_unevictable_unregister_node(struct node *node) 3486{ 3487 sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages); 3488} 3489#endif 3490