vmscan.c revision 918d3f90e8d5657491024f64427e9a5ea632d284
/*
 *  linux/mm/vmscan.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  Swap reorganised 29.12.95, Stephen Tweedie.
 *  kswapd added: 7.1.96  sct
 *  Removed kswapd_ctl limits, and swap out as many pages as needed
 *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
 *  Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
 *  Multiqueue VM started 5.8.00, Rik van Riel.
 */

#include <linux/mm.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/vmstat.h>
#include <linux/file.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>	/* for try_to_release_page(),
					buffer_heads_over_limit */
#include <linux/mm_inline.h>
#include <linux/pagevec.h>
#include <linux/backing-dev.h>
#include <linux/rmap.h>
#include <linux/topology.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/notifier.h>
#include <linux/rwsem.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/freezer.h>

#include <asm/tlbflush.h>
#include <asm/div64.h>

#include <linux/swapops.h>

#include "internal.h"

struct scan_control {
	/* Incremented by the number of inactive pages that were scanned */
	unsigned long nr_scanned;

	/* This context's GFP mask */
	gfp_t gfp_mask;

	int may_writepage;

	/* Can pages be swapped as part of reclaim? */
	int may_swap;

	/* This context's SWAP_CLUSTER_MAX. If freeing memory for
	 * suspend, we effectively ignore SWAP_CLUSTER_MAX.
	 * In this context, it doesn't matter that we scan the
	 * whole list at once. */
	int swap_cluster_max;

	int swappiness;

	int all_unreclaimable;
};

/*
 * The list of shrinker callbacks used to apply pressure to
 * ageable caches.
 */
struct shrinker {
	shrinker_t		shrinker;
	struct list_head	list;
	int			seeks;	/* seeks to recreate an obj */
	long			nr;	/* objs pending delete */
};

#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))

#ifdef ARCH_HAS_PREFETCH
#define prefetch_prev_lru_page(_page, _base, _field)			\
	do {								\
		if ((_page)->lru.prev != _base) {			\
			struct page *prev;				\
									\
			prev = lru_to_page(&(_page->lru));		\
			prefetch(&prev->_field);			\
		}							\
	} while (0)
#else
#define prefetch_prev_lru_page(_page, _base, _field) do { } while (0)
#endif

#ifdef ARCH_HAS_PREFETCHW
#define prefetchw_prev_lru_page(_page, _base, _field)			\
	do {								\
		if ((_page)->lru.prev != _base) {			\
			struct page *prev;				\
									\
			prev = lru_to_page(&(_page->lru));		\
			prefetchw(&prev->_field);			\
		}							\
	} while (0)
#else
#define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
#endif

/*
 * From 0 .. 100.  Higher means more swappy.
 */
int vm_swappiness = 60;
long vm_total_pages;	/* The total number of pages which the VM controls */

static LIST_HEAD(shrinker_list);
static DECLARE_RWSEM(shrinker_rwsem);

/*
 * Add a shrinker callback to be called from the vm
 */
struct shrinker *set_shrinker(int seeks, shrinker_t theshrinker)
{
	struct shrinker *shrinker;

	shrinker = kmalloc(sizeof(*shrinker), GFP_KERNEL);
	if (shrinker) {
		shrinker->shrinker = theshrinker;
		shrinker->seeks = seeks;
		shrinker->nr = 0;
		down_write(&shrinker_rwsem);
		list_add_tail(&shrinker->list, &shrinker_list);
		up_write(&shrinker_rwsem);
	}
	return shrinker;
}
EXPORT_SYMBOL(set_shrinker);

/*
 * Remove one
 */
void remove_shrinker(struct shrinker *shrinker)
{
	down_write(&shrinker_rwsem);
	list_del(&shrinker->list);
	up_write(&shrinker_rwsem);
	kfree(shrinker);
}
EXPORT_SYMBOL(remove_shrinker);
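
/*
 * Illustrative sketch (added comment, not part of the original file): a
 * typical shrinkable cache registers a scan callback with set_shrinker()
 * and tears it down with remove_shrinker().  The callback is asked how
 * many objects it could free when called with nr_to_scan == 0, and is
 * asked to free up to nr_to_scan objects otherwise, returning the
 * remaining object count (or -1 to abort).  The names my_cache_shrink,
 * my_cache_free_some and my_cache_object_count below are made up for
 * the example.
 *
 *	static int my_cache_shrink(int nr_to_scan, gfp_t gfp_mask)
 *	{
 *		if (nr_to_scan)
 *			my_cache_free_some(nr_to_scan);
 *		return my_cache_object_count();
 *	}
 *
 *	static struct shrinker *my_shrinker;
 *
 *	my_shrinker = set_shrinker(DEFAULT_SEEKS, my_cache_shrink);
 *	...
 *	remove_shrinker(my_shrinker);
 */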

#define SHRINK_BATCH 128
/*
 * Call the shrink functions to age shrinkable caches
 *
 * Here we assume it costs one seek to replace a lru page and that it also
 * takes a seek to recreate a cache object.  With this in mind we age equal
 * percentages of the lru and ageable caches.  This should balance the seeks
 * generated by these structures.
 *
 * If the vm encountered mapped pages on the LRU it increases the pressure on
 * slab to avoid swapping.
 *
 * We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits.
 *
 * `lru_pages' represents the number of on-LRU pages in all the zones which
 * are eligible for the caller's allocation attempt.  It is used for balancing
 * slab reclaim versus page reclaim.
 *
 * Returns the number of slab objects which we shrunk.
 */
unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
			unsigned long lru_pages)
{
	struct shrinker *shrinker;
	unsigned long ret = 0;

	if (scanned == 0)
		scanned = SWAP_CLUSTER_MAX;

	if (!down_read_trylock(&shrinker_rwsem))
		return 1;	/* Assume we'll be able to shrink next time */

	list_for_each_entry(shrinker, &shrinker_list, list) {
		unsigned long long delta;
		unsigned long total_scan;
		unsigned long max_pass = (*shrinker->shrinker)(0, gfp_mask);

		delta = (4 * scanned) / shrinker->seeks;
		delta *= max_pass;
		do_div(delta, lru_pages + 1);
		shrinker->nr += delta;
		if (shrinker->nr < 0) {
			printk(KERN_ERR "%s: nr=%ld\n",
					__FUNCTION__, shrinker->nr);
			shrinker->nr = max_pass;
		}

		/*
		 * Avoid risking looping forever due to too large nr value:
		 * never try to free more than twice the estimated number of
		 * freeable entries.
		 */
		if (shrinker->nr > max_pass * 2)
			shrinker->nr = max_pass * 2;
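
		/*
		 * Illustrative arithmetic (example numbers, not from the
		 * original source): with scanned = 1024 LRU pages,
		 * seeks = 2, max_pass = 10000 freeable objects and
		 * lru_pages = 100000, the pressure added above is
		 *
		 *	delta = (4 * 1024 / 2) * 10000 / (100000 + 1)
		 *	      ~= 204 objects,
		 *
		 * i.e. the cache is asked to give up roughly the same
		 * proportion of its objects (~2%) as the fraction of the
		 * LRU that was scanned (~1%), doubled by the 4/seeks
		 * factor for a seeks value of 2.
		 */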

		total_scan = shrinker->nr;
		shrinker->nr = 0;

		while (total_scan >= SHRINK_BATCH) {
			long this_scan = SHRINK_BATCH;
			int shrink_ret;
			int nr_before;

			nr_before = (*shrinker->shrinker)(0, gfp_mask);
			shrink_ret = (*shrinker->shrinker)(this_scan, gfp_mask);
			if (shrink_ret == -1)
				break;
			if (shrink_ret < nr_before)
				ret += nr_before - shrink_ret;
			count_vm_events(SLABS_SCANNED, this_scan);
			total_scan -= this_scan;

			cond_resched();
		}

		shrinker->nr += total_scan;
	}
	up_read(&shrinker_rwsem);
	return ret;
}

/* Called without lock on whether page is mapped, so answer is unstable */
static inline int page_mapping_inuse(struct page *page)
{
	struct address_space *mapping;

	/* Page is in somebody's page tables. */
	if (page_mapped(page))
		return 1;

	/* Be more reluctant to reclaim swapcache than pagecache */
	if (PageSwapCache(page))
		return 1;

	mapping = page_mapping(page);
	if (!mapping)
		return 0;

	/* File is mmap'd by somebody? */
	return mapping_mapped(mapping);
}

static inline int is_page_cache_freeable(struct page *page)
{
	return page_count(page) - !!PagePrivate(page) == 2;
}
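
/*
 * Clarifying note (added comment, not in the original source): a page is
 * considered freeable here when the only references left are the page
 * cache's and the one held by the caller who isolated the page from the
 * LRU, hence the "== 2".  Buffer heads attached via attach_page_buffers()
 * hold one extra page reference, which the !!PagePrivate(page) term
 * subtracts out.
 */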

static int may_write_to_queue(struct backing_dev_info *bdi)
{
	if (current->flags & PF_SWAPWRITE)
		return 1;
	if (!bdi_write_congested(bdi))
		return 1;
	if (bdi == current->backing_dev_info)
		return 1;
	return 0;
}

/*
 * We detected a synchronous write error writing a page out.  Probably
 * -ENOSPC.  We need to propagate that into the address_space for a subsequent
 * fsync(), msync() or close().
 *
 * The tricky part is that after writepage we cannot touch the mapping: nothing
 * prevents it from being freed up.  But we have a ref on the page and once
 * that page is locked, the mapping is pinned.
 *
 * We're allowed to run sleeping lock_page() here because we know the caller has
 * __GFP_FS.
 */
static void handle_write_error(struct address_space *mapping,
				struct page *page, int error)
{
	lock_page(page);
	if (page_mapping(page) == mapping) {
		if (error == -ENOSPC)
			set_bit(AS_ENOSPC, &mapping->flags);
		else
			set_bit(AS_EIO, &mapping->flags);
	}
	unlock_page(page);
}

/* possible outcome of pageout() */
typedef enum {
	/* failed to write page out, page is locked */
	PAGE_KEEP,
	/* move page to the active list, page is locked */
	PAGE_ACTIVATE,
	/* page has been sent to the disk successfully, page is unlocked */
	PAGE_SUCCESS,
	/* page is clean and locked */
	PAGE_CLEAN,
} pageout_t;

/*
 * pageout is called by shrink_page_list() for each dirty page.
 * Calls ->writepage().
 */
static pageout_t pageout(struct page *page, struct address_space *mapping)
{
	/*
	 * If the page is dirty, only perform writeback if that write
	 * will be non-blocking.  To prevent this allocation from being
	 * stalled by pagecache activity.  But note that there may be
	 * stalls if we need to run get_block().  We could test
	 * PagePrivate for that.
	 *
	 * If this process is currently in generic_file_write() against
	 * this page's queue, we can perform writeback even if that
	 * will block.
	 *
	 * If the page is swapcache, write it back even if that would
	 * block, for some throttling.  This happens by accident, because
	 * swap_backing_dev_info is bust: it doesn't reflect the
	 * congestion state of the swapdevs.  Easy to fix, if needed.
	 * See swapfile.c:page_queue_congested().
	 */
	if (!is_page_cache_freeable(page))
		return PAGE_KEEP;
	if (!mapping) {
		/*
		 * Some data journaling orphaned pages can have
		 * page->mapping == NULL while being dirty with clean buffers.
		 */
		if (PagePrivate(page)) {
			if (try_to_free_buffers(page)) {
				ClearPageDirty(page);
				printk("%s: orphaned page\n", __FUNCTION__);
				return PAGE_CLEAN;
			}
		}
		return PAGE_KEEP;
	}
	if (mapping->a_ops->writepage == NULL)
		return PAGE_ACTIVATE;
	if (!may_write_to_queue(mapping->backing_dev_info))
		return PAGE_KEEP;

	if (clear_page_dirty_for_io(page)) {
		int res;
		struct writeback_control wbc = {
			.sync_mode = WB_SYNC_NONE,
			.nr_to_write = SWAP_CLUSTER_MAX,
			.range_start = 0,
			.range_end = LLONG_MAX,
			.nonblocking = 1,
			.for_reclaim = 1,
		};

		SetPageReclaim(page);
		res = mapping->a_ops->writepage(page, &wbc);
		if (res < 0)
			handle_write_error(mapping, page, res);
		if (res == AOP_WRITEPAGE_ACTIVATE) {
			ClearPageReclaim(page);
			return PAGE_ACTIVATE;
		}
		if (!PageWriteback(page)) {
			/* synchronous write or broken a_ops? */
			ClearPageReclaim(page);
		}
		inc_zone_page_state(page, NR_VMSCAN_WRITE);
		return PAGE_SUCCESS;
	}

	return PAGE_CLEAN;
}

/*
 * Attempt to detach a locked page from its ->mapping.  If it is dirty or if
 * someone else has a ref on the page, abort and return 0.  If it was
 * successfully detached, return 1.  Assumes the caller has a single ref on
 * this page.
 */
int remove_mapping(struct address_space *mapping, struct page *page)
{
	BUG_ON(!PageLocked(page));
	BUG_ON(mapping != page_mapping(page));

	write_lock_irq(&mapping->tree_lock);
	/*
	 * The non racy check for a busy page.
	 *
	 * Must be careful with the order of the tests. When someone has
	 * a ref to the page, it may be possible that they dirty it then
	 * drop the reference. So if PageDirty is tested before page_count
	 * here, then the following race may occur:
	 *
	 * get_user_pages(&page);
	 * [user mapping goes away]
	 * write_to(page);
	 *				!PageDirty(page)    [good]
	 * SetPageDirty(page);
	 * put_page(page);
	 *				!page_count(page)   [good, discard it]
	 *
	 * [oops, our write_to data is lost]
	 *
	 * Reversing the order of the tests ensures such a situation cannot
	 * escape unnoticed. The smp_rmb is needed to ensure the page->flags
	 * load is not satisfied before that of page->_count.
	 *
	 * Note that if SetPageDirty is always performed via set_page_dirty,
	 * and thus under tree_lock, then this ordering is not required.
	 */
	if (unlikely(page_count(page) != 2))
		goto cannot_free;
	smp_rmb();
	if (unlikely(PageDirty(page)))
		goto cannot_free;

	if (PageSwapCache(page)) {
		swp_entry_t swap = { .val = page_private(page) };
		__delete_from_swap_cache(page);
		write_unlock_irq(&mapping->tree_lock);
		swap_free(swap);
		__put_page(page);	/* The pagecache ref */
		return 1;
	}

	__remove_from_page_cache(page);
	write_unlock_irq(&mapping->tree_lock);
	__put_page(page);
	return 1;

cannot_free:
	write_unlock_irq(&mapping->tree_lock);
	return 0;
}

/*
 * shrink_page_list() returns the number of reclaimed pages
 */
static unsigned long shrink_page_list(struct list_head *page_list,
					struct scan_control *sc)
{
	LIST_HEAD(ret_pages);
	struct pagevec freed_pvec;
	int pgactivate = 0;
	unsigned long nr_reclaimed = 0;

	cond_resched();

	pagevec_init(&freed_pvec, 1);
	while (!list_empty(page_list)) {
		struct address_space *mapping;
		struct page *page;
		int may_enter_fs;
		int referenced;

		cond_resched();

		page = lru_to_page(page_list);
		list_del(&page->lru);

		if (TestSetPageLocked(page))
			goto keep;

		VM_BUG_ON(PageActive(page));

		sc->nr_scanned++;

		if (!sc->may_swap && page_mapped(page))
			goto keep_locked;

		/* Double the slab pressure for mapped and swapcache pages */
		if (page_mapped(page) || PageSwapCache(page))
			sc->nr_scanned++;

		if (PageWriteback(page))
			goto keep_locked;

		referenced = page_referenced(page, 1);
		/* In active use or really unfreeable?  Activate it. */
		if (referenced && page_mapping_inuse(page))
			goto activate_locked;

#ifdef CONFIG_SWAP
		/*
		 * Anonymous process memory has backing store?
		 * Try to allocate it some swap space here.
		 */
		if (PageAnon(page) && !PageSwapCache(page))
			if (!add_to_swap(page, GFP_ATOMIC))
				goto activate_locked;
#endif /* CONFIG_SWAP */

		mapping = page_mapping(page);
		may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
			(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));

		/*
		 * The page is mapped into the page tables of one or more
		 * processes. Try to unmap it here.
		 */
		if (page_mapped(page) && mapping) {
			switch (try_to_unmap(page, 0)) {
			case SWAP_FAIL:
				goto activate_locked;
			case SWAP_AGAIN:
				goto keep_locked;
			case SWAP_SUCCESS:
				; /* try to free the page below */
			}
		}

		if (PageDirty(page)) {
			if (referenced)
				goto keep_locked;
			if (!may_enter_fs)
				goto keep_locked;
			if (!sc->may_writepage)
				goto keep_locked;

			/* Page is dirty, try to write it out here */
			switch (pageout(page, mapping)) {
			case PAGE_KEEP:
				goto keep_locked;
			case PAGE_ACTIVATE:
				goto activate_locked;
			case PAGE_SUCCESS:
				if (PageWriteback(page) || PageDirty(page))
					goto keep;
				/*
				 * A synchronous write - probably a ramdisk.  Go
				 * ahead and try to reclaim the page.
				 */
				if (TestSetPageLocked(page))
					goto keep;
				if (PageDirty(page) || PageWriteback(page))
					goto keep_locked;
				mapping = page_mapping(page);
			case PAGE_CLEAN:
				; /* try to free the page below */
			}
		}

		/*
		 * If the page has buffers, try to free the buffer mappings
		 * associated with this page. If we succeed we try to free
		 * the page as well.
		 *
		 * We do this even if the page is PageDirty().
		 * try_to_release_page() does not perform I/O, but it is
		 * possible for a page to have PageDirty set, but it is actually
		 * clean (all its buffers are clean).  This happens if the
		 * buffers were written out directly, with submit_bh(). ext3
		 * will do this, as well as the blockdev mapping.
		 * try_to_release_page() will discover that cleanness and will
		 * drop the buffers and mark the page clean - it can be freed.
		 *
		 * Rarely, pages can have buffers and no ->mapping.  These are
		 * the pages which were not successfully invalidated in
		 * truncate_complete_page().  We try to drop those buffers here
		 * and if that worked, and the page is no longer mapped into
		 * process address space (page_count == 1) it can be freed.
		 * Otherwise, leave the page on the LRU so it is swappable.
		 */
		if (PagePrivate(page)) {
			if (!try_to_release_page(page, sc->gfp_mask))
				goto activate_locked;
			if (!mapping && page_count(page) == 1)
				goto free_it;
		}

		if (!mapping || !remove_mapping(mapping, page))
			goto keep_locked;

free_it:
		unlock_page(page);
		nr_reclaimed++;
		if (!pagevec_add(&freed_pvec, page))
			__pagevec_release_nonlru(&freed_pvec);
		continue;

activate_locked:
		SetPageActive(page);
		pgactivate++;
keep_locked:
		unlock_page(page);
keep:
		list_add(&page->lru, &ret_pages);
		VM_BUG_ON(PageLRU(page));
	}
	list_splice(&ret_pages, page_list);
	if (pagevec_count(&freed_pvec))
		__pagevec_release_nonlru(&freed_pvec);
	count_vm_events(PGACTIVATE, pgactivate);
	return nr_reclaimed;
}

/*
 * zone->lru_lock is heavily contended.  Some of the functions that
 * shrink the lists perform better by taking out a batch of pages
 * and working on them outside the LRU lock.
 *
 * For pagecache intensive workloads, this function is the hottest
 * spot in the kernel (apart from copy_*_user functions).
 *
 * Appropriate locks must be held before calling this function.
 *
 * @nr_to_scan:	The number of pages to look through on the list.
 * @src:	The LRU list to pull pages off.
 * @dst:	The temp list to put pages on to.
 * @scanned:	The number of pages that were scanned.
 *
 * returns how many pages were moved onto *@dst.
 */
static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
		struct list_head *src, struct list_head *dst,
		unsigned long *scanned)
{
	unsigned long nr_taken = 0;
	struct page *page;
	unsigned long scan;

	for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
		struct list_head *target;
		page = lru_to_page(src);
		prefetchw_prev_lru_page(page, src, flags);

		VM_BUG_ON(!PageLRU(page));

		list_del(&page->lru);
		target = src;
		if (likely(get_page_unless_zero(page))) {
			/*
			 * Be careful not to clear PageLRU until after we're
			 * sure the page is not being freed elsewhere -- the
			 * page release code relies on it.
			 */
			ClearPageLRU(page);
			target = dst;
			nr_taken++;
		} /* else it is being freed elsewhere */

		list_add(&page->lru, target);
	}

	*scanned = scan;
	return nr_taken;
}

/*
 * shrink_inactive_list() is a helper for shrink_zone().  It returns the number
 * of reclaimed pages
 */
static unsigned long shrink_inactive_list(unsigned long max_scan,
				struct zone *zone, struct scan_control *sc)
{
	LIST_HEAD(page_list);
	struct pagevec pvec;
	unsigned long nr_scanned = 0;
	unsigned long nr_reclaimed = 0;

	pagevec_init(&pvec, 1);

	lru_add_drain();
	spin_lock_irq(&zone->lru_lock);
	do {
		struct page *page;
		unsigned long nr_taken;
		unsigned long nr_scan;
		unsigned long nr_freed;

		nr_taken = isolate_lru_pages(sc->swap_cluster_max,
					     &zone->inactive_list,
					     &page_list, &nr_scan);
		zone->nr_inactive -= nr_taken;
		zone->pages_scanned += nr_scan;
		spin_unlock_irq(&zone->lru_lock);

		nr_scanned += nr_scan;
		nr_freed = shrink_page_list(&page_list, sc);
		nr_reclaimed += nr_freed;
		local_irq_disable();
		if (current_is_kswapd()) {
			__count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan);
			__count_vm_events(KSWAPD_STEAL, nr_freed);
		} else
			__count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan);
		__count_zone_vm_events(PGSTEAL, zone, nr_freed);

		if (nr_taken == 0)
			goto done;

		spin_lock(&zone->lru_lock);
		/*
		 * Put back any unfreeable pages.
		 */
		while (!list_empty(&page_list)) {
			page = lru_to_page(&page_list);
			VM_BUG_ON(PageLRU(page));
			SetPageLRU(page);
			list_del(&page->lru);
			if (PageActive(page))
				add_page_to_active_list(zone, page);
			else
				add_page_to_inactive_list(zone, page);
			if (!pagevec_add(&pvec, page)) {
				spin_unlock_irq(&zone->lru_lock);
				__pagevec_release(&pvec);
				spin_lock_irq(&zone->lru_lock);
			}
		}
	} while (nr_scanned < max_scan);
	spin_unlock(&zone->lru_lock);
done:
	local_irq_enable();
	pagevec_release(&pvec);
	return nr_reclaimed;
}

/*
 * We are about to scan this zone at a certain priority level.  If that priority
 * level is smaller (ie: more urgent) than the previous priority, then note
 * that priority level within the zone.  This is done so that when the next
 * process comes in to scan this zone, it will immediately start out at this
 * priority level rather than having to build up its own scanning priority.
 * Here, this priority affects only the reclaim-mapped threshold.
 */
static inline void note_zone_scanning_priority(struct zone *zone, int priority)
{
	if (priority < zone->prev_priority)
		zone->prev_priority = priority;
}

static inline int zone_is_near_oom(struct zone *zone)
{
	return zone->pages_scanned >= (zone->nr_active + zone->nr_inactive)*3;
}

/*
 * This moves pages from the active list to the inactive list.
 *
 * We move them the other way if the page is referenced by one or more
 * processes, from rmap.
 *
 * If the pages are mostly unmapped, the processing is fast and it is
 * appropriate to hold zone->lru_lock across the whole operation.  But if
 * the pages are mapped, the processing is slow (page_referenced()) so we
 * should drop zone->lru_lock around each page.  It's impossible to balance
 * this, so instead we remove the pages from the LRU while processing them.
 * It is safe to rely on PG_active against the non-LRU pages in here because
 * nobody will play with that bit on a non-LRU page.
 *
 * The downside is that we have to touch page->_count against each page.
 * But we had to alter page->flags anyway.
 */
static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
				struct scan_control *sc, int priority)
{
	unsigned long pgmoved;
	int pgdeactivate = 0;
	unsigned long pgscanned;
	LIST_HEAD(l_hold);	/* The pages which were snipped off */
	LIST_HEAD(l_inactive);	/* Pages to go onto the inactive_list */
	LIST_HEAD(l_active);	/* Pages to go onto the active_list */
	struct page *page;
	struct pagevec pvec;
	int reclaim_mapped = 0;

	if (sc->may_swap) {
		long mapped_ratio;
		long distress;
		long swap_tendency;

		if (zone_is_near_oom(zone))
			goto force_reclaim_mapped;

		/*
		 * `distress' is a measure of how much trouble we're having
		 * reclaiming pages.  0 -> no problems.  100 -> great trouble.
		 */
		distress = 100 >> min(zone->prev_priority, priority);

		/*
		 * The point of this algorithm is to decide when to start
		 * reclaiming mapped memory instead of just pagecache.  Work out
		 * how much memory is mapped.
		 */
		mapped_ratio = ((global_page_state(NR_FILE_MAPPED) +
				global_page_state(NR_ANON_PAGES)) * 100) /
					vm_total_pages;

		/*
		 * Now decide how much we really want to unmap some pages.  The
		 * mapped ratio is downgraded - just because there's a lot of
		 * mapped memory doesn't necessarily mean that page reclaim
		 * isn't succeeding.
		 *
		 * The distress ratio is important - we don't want to start
		 * going oom.
		 *
		 * A 100% value of vm_swappiness overrides this algorithm
		 * altogether.
		 */
		swap_tendency = mapped_ratio / 2 + distress + sc->swappiness;

		/*
		 * Now use this metric to decide whether to start moving mapped
		 * memory onto the inactive list.
		 */
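		/*
		 * Illustrative arithmetic (example numbers, not from the
		 * original source): with half of memory mapped
		 * (mapped_ratio = 50), no reclaim trouble so far
		 * (prev_priority == priority == DEF_PRIORITY, so
		 * distress = 100 >> 12 = 0) and the default swappiness of
		 * 60, swap_tendency is 50/2 + 0 + 60 = 85, which stays
		 * below 100 and mapped pages are left alone.  Once the
		 * scan priority has fallen to 2, distress = 100 >> 2 = 25
		 * and swap_tendency = 25 + 25 + 60 = 110, so mapped pages
		 * start being deactivated as well.
		 */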
		if (swap_tendency >= 100)
force_reclaim_mapped:
			reclaim_mapped = 1;
	}

	lru_add_drain();
	spin_lock_irq(&zone->lru_lock);
	pgmoved = isolate_lru_pages(nr_pages, &zone->active_list,
				    &l_hold, &pgscanned);
	zone->pages_scanned += pgscanned;
	zone->nr_active -= pgmoved;
	spin_unlock_irq(&zone->lru_lock);

	while (!list_empty(&l_hold)) {
		cond_resched();
		page = lru_to_page(&l_hold);
		list_del(&page->lru);
		if (page_mapped(page)) {
			if (!reclaim_mapped ||
			    (total_swap_pages == 0 && PageAnon(page)) ||
			    page_referenced(page, 0)) {
				list_add(&page->lru, &l_active);
				continue;
			}
		}
		list_add(&page->lru, &l_inactive);
	}

	pagevec_init(&pvec, 1);
	pgmoved = 0;
	spin_lock_irq(&zone->lru_lock);
	while (!list_empty(&l_inactive)) {
		page = lru_to_page(&l_inactive);
		prefetchw_prev_lru_page(page, &l_inactive, flags);
		VM_BUG_ON(PageLRU(page));
		SetPageLRU(page);
		VM_BUG_ON(!PageActive(page));
		ClearPageActive(page);

		list_move(&page->lru, &zone->inactive_list);
		pgmoved++;
		if (!pagevec_add(&pvec, page)) {
			zone->nr_inactive += pgmoved;
			spin_unlock_irq(&zone->lru_lock);
			pgdeactivate += pgmoved;
			pgmoved = 0;
			if (buffer_heads_over_limit)
				pagevec_strip(&pvec);
			__pagevec_release(&pvec);
			spin_lock_irq(&zone->lru_lock);
		}
	}
	zone->nr_inactive += pgmoved;
	pgdeactivate += pgmoved;
	if (buffer_heads_over_limit) {
		spin_unlock_irq(&zone->lru_lock);
		pagevec_strip(&pvec);
		spin_lock_irq(&zone->lru_lock);
	}

	pgmoved = 0;
	while (!list_empty(&l_active)) {
		page = lru_to_page(&l_active);
		prefetchw_prev_lru_page(page, &l_active, flags);
		VM_BUG_ON(PageLRU(page));
		SetPageLRU(page);
		VM_BUG_ON(!PageActive(page));
		list_move(&page->lru, &zone->active_list);
		pgmoved++;
		if (!pagevec_add(&pvec, page)) {
			zone->nr_active += pgmoved;
			pgmoved = 0;
			spin_unlock_irq(&zone->lru_lock);
			__pagevec_release(&pvec);
			spin_lock_irq(&zone->lru_lock);
		}
	}
	zone->nr_active += pgmoved;

	__count_zone_vm_events(PGREFILL, zone, pgscanned);
	__count_vm_events(PGDEACTIVATE, pgdeactivate);
	spin_unlock_irq(&zone->lru_lock);

	pagevec_release(&pvec);
}

/*
 * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
 */
static unsigned long shrink_zone(int priority, struct zone *zone,
				struct scan_control *sc)
{
	unsigned long nr_active;
	unsigned long nr_inactive;
	unsigned long nr_to_scan;
	unsigned long nr_reclaimed = 0;

	atomic_inc(&zone->reclaim_in_progress);

	/*
	 * Add one to `nr_to_scan' just to make sure that the kernel will
	 * slowly sift through the active list.
	 */
	zone->nr_scan_active += (zone->nr_active >> priority) + 1;
	nr_active = zone->nr_scan_active;
	if (nr_active >= sc->swap_cluster_max)
		zone->nr_scan_active = 0;
	else
		nr_active = 0;

	zone->nr_scan_inactive += (zone->nr_inactive >> priority) + 1;
	nr_inactive = zone->nr_scan_inactive;
	if (nr_inactive >= sc->swap_cluster_max)
		zone->nr_scan_inactive = 0;
	else
		nr_inactive = 0;
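
	/*
	 * Illustrative arithmetic (example numbers, not from the original
	 * source): with 100,000 inactive pages at DEF_PRIORITY (12), each
	 * call adds (100000 >> 12) + 1 = 25 pages to nr_scan_inactive.
	 * That is below a swap_cluster_max of SWAP_CLUSTER_MAX (32), so
	 * nothing is scanned until the count has accumulated to 50 on the
	 * second call.  Each drop in priority roughly doubles the
	 * increment, so at priority 11 it becomes (100000 >> 11) + 1 = 49
	 * and every call scans a batch.
	 */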

	while (nr_active || nr_inactive) {
		if (nr_active) {
			nr_to_scan = min(nr_active,
					(unsigned long)sc->swap_cluster_max);
			nr_active -= nr_to_scan;
			shrink_active_list(nr_to_scan, zone, sc, priority);
		}

		if (nr_inactive) {
			nr_to_scan = min(nr_inactive,
					(unsigned long)sc->swap_cluster_max);
			nr_inactive -= nr_to_scan;
			nr_reclaimed += shrink_inactive_list(nr_to_scan, zone,
								sc);
		}
	}

	throttle_vm_writeout();

	atomic_dec(&zone->reclaim_in_progress);
	return nr_reclaimed;
}

/*
 * This is the direct reclaim path, for page-allocating processes.  We only
 * try to reclaim pages from zones which will satisfy the caller's allocation
 * request.
 *
 * We reclaim from a zone even if that zone is over pages_high.  Because:
 * a) The caller may be trying to free *extra* pages to satisfy a higher-order
 *    allocation or
 * b) The zones may be over pages_high but they must go *over* pages_high to
 *    satisfy the `incremental min' zone defense algorithm.
 *
 * Returns the number of reclaimed pages.
 *
 * If a zone is deemed to be full of pinned pages then just give it a light
 * scan then give up on it.
 */
static unsigned long shrink_zones(int priority, struct zone **zones,
					struct scan_control *sc)
{
	unsigned long nr_reclaimed = 0;
	int i;

	sc->all_unreclaimable = 1;
	for (i = 0; zones[i] != NULL; i++) {
		struct zone *zone = zones[i];

		if (!populated_zone(zone))
			continue;

		if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
			continue;

		note_zone_scanning_priority(zone, priority);

		if (zone->all_unreclaimable && priority != DEF_PRIORITY)
			continue;	/* Let kswapd poll it */

		sc->all_unreclaimable = 0;

		nr_reclaimed += shrink_zone(priority, zone, sc);
	}
	return nr_reclaimed;
}

/*
 * This is the main entry point to direct page reclaim.
 *
 * If a full scan of the inactive list fails to free enough memory then we
 * are "out of memory" and something needs to be killed.
 *
 * If the caller is !__GFP_FS then the probability of a failure is reasonably
 * high - the zone may be full of dirty or under-writeback pages, which this
 * caller can't do much about.  We kick pdflush and take explicit naps in the
 * hope that some of these pages can be written.  But if the allocating task
 * holds filesystem locks which prevent writeout this might not work, and the
 * allocation attempt will fail.
 */
unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
{
	int priority;
	int ret = 0;
	unsigned long total_scanned = 0;
	unsigned long nr_reclaimed = 0;
	struct reclaim_state *reclaim_state = current->reclaim_state;
	unsigned long lru_pages = 0;
	int i;
	struct scan_control sc = {
		.gfp_mask = gfp_mask,
		.may_writepage = !laptop_mode,
		.swap_cluster_max = SWAP_CLUSTER_MAX,
		.may_swap = 1,
		.swappiness = vm_swappiness,
	};

	count_vm_event(ALLOCSTALL);

	for (i = 0; zones[i] != NULL; i++) {
		struct zone *zone = zones[i];

		if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
			continue;

		lru_pages += zone->nr_active + zone->nr_inactive;
	}

	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
		sc.nr_scanned = 0;
		if (!priority)
			disable_swap_token();
		nr_reclaimed += shrink_zones(priority, zones, &sc);
		shrink_slab(sc.nr_scanned, gfp_mask, lru_pages);
		if (reclaim_state) {
			nr_reclaimed += reclaim_state->reclaimed_slab;
			reclaim_state->reclaimed_slab = 0;
		}
		total_scanned += sc.nr_scanned;
		if (nr_reclaimed >= sc.swap_cluster_max) {
			ret = 1;
			goto out;
		}

		/*
		 * Try to write back as many pages as we just scanned.  This
		 * tends to cause slow streaming writers to write data to the
		 * disk smoothly, at the dirtying rate, which is nice.  But
		 * that's undesirable in laptop mode, where we *want* lumpy
		 * writeout.  So in laptop mode, write out the whole world.
		 */
		if (total_scanned > sc.swap_cluster_max +
					sc.swap_cluster_max / 2) {
			wakeup_pdflush(laptop_mode ? 0 : total_scanned);
			sc.may_writepage = 1;
		}

		/* Take a nap, wait for some writeback to complete */
		if (sc.nr_scanned && priority < DEF_PRIORITY - 2)
			congestion_wait(WRITE, HZ/10);
	}
	/* top priority shrink_caches still had more to do? don't OOM, then */
	if (!sc.all_unreclaimable)
		ret = 1;
out:
	/*
	 * Now that we've scanned all the zones at this priority level, note
	 * that level within the zone so that the next thread which performs
	 * scanning of this zone will immediately start out at this priority
	 * level.  This affects only the decision whether or not to bring
	 * mapped pages onto the inactive list.
	 */
	if (priority < 0)
		priority = 0;
	for (i = 0; zones[i] != 0; i++) {
		struct zone *zone = zones[i];

		if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
			continue;

		zone->prev_priority = priority;
	}
	return ret;
}

/*
 * For kswapd, balance_pgdat() will work across all this node's zones until
 * they are all at pages_high.
 *
 * Returns the number of pages which were actually freed.
 *
 * There is special handling here for zones which are full of pinned pages.
 * This can happen if the pages are all mlocked, or if they are all used by
 * device drivers (say, ZONE_DMA). Or if they are all in use by hugetlb.
 * What we do is to detect the case where all pages in the zone have been
 * scanned twice and there has been zero successful reclaim.  Mark the zone as
 * dead and from now on, only perform a short scan.  Basically we're polling
 * the zone for when the problem goes away.
 *
 * kswapd scans the zones in the highmem->normal->dma direction.  It skips
 * zones which have free_pages > pages_high, but once a zone is found to have
 * free_pages <= pages_high, we scan that zone and the lower zones regardless
 * of the number of free pages in the lower zones.  This interoperates with
 * the page allocator fallback scheme to ensure that aging of pages is balanced
 * across the zones.
 */
static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
{
	int all_zones_ok;
	int priority;
	int i;
	unsigned long total_scanned;
	unsigned long nr_reclaimed;
	struct reclaim_state *reclaim_state = current->reclaim_state;
	struct scan_control sc = {
		.gfp_mask = GFP_KERNEL,
		.may_swap = 1,
		.swap_cluster_max = SWAP_CLUSTER_MAX,
		.swappiness = vm_swappiness,
	};
	/*
	 * temp_priority is used to remember the scanning priority at which
	 * this zone was successfully refilled to free_pages == pages_high.
	 */
	int temp_priority[MAX_NR_ZONES];

loop_again:
	total_scanned = 0;
	nr_reclaimed = 0;
	sc.may_writepage = !laptop_mode;
	count_vm_event(PAGEOUTRUN);

	for (i = 0; i < pgdat->nr_zones; i++)
		temp_priority[i] = DEF_PRIORITY;

	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
		int end_zone = 0;	/* Inclusive.  0 = ZONE_DMA */
		unsigned long lru_pages = 0;

		/* The swap token gets in the way of swapout... */
		if (!priority)
			disable_swap_token();

		all_zones_ok = 1;

		/*
		 * Scan in the highmem->dma direction for the highest
		 * zone which needs scanning
		 */
		for (i = pgdat->nr_zones - 1; i >= 0; i--) {
			struct zone *zone = pgdat->node_zones + i;

			if (!populated_zone(zone))
				continue;

			if (zone->all_unreclaimable && priority != DEF_PRIORITY)
				continue;

			if (!zone_watermark_ok(zone, order, zone->pages_high,
					       0, 0)) {
				end_zone = i;
				break;
			}
		}
		if (i < 0)
			goto out;

		for (i = 0; i <= end_zone; i++) {
			struct zone *zone = pgdat->node_zones + i;

			lru_pages += zone->nr_active + zone->nr_inactive;
		}

		/*
		 * Now scan the zone in the dma->highmem direction, stopping
		 * at the last zone which needs scanning.
		 *
		 * We do this because the page allocator works in the opposite
		 * direction.  This prevents the page allocator from allocating
		 * pages behind kswapd's direction of progress, which would
		 * cause too much scanning of the lower zones.
		 */
		for (i = 0; i <= end_zone; i++) {
			struct zone *zone = pgdat->node_zones + i;
			int nr_slab;

			if (!populated_zone(zone))
				continue;

			if (zone->all_unreclaimable && priority != DEF_PRIORITY)
				continue;

			if (!zone_watermark_ok(zone, order, zone->pages_high,
					       end_zone, 0))
				all_zones_ok = 0;
			temp_priority[i] = priority;
			sc.nr_scanned = 0;
			note_zone_scanning_priority(zone, priority);
			nr_reclaimed += shrink_zone(priority, zone, &sc);
			reclaim_state->reclaimed_slab = 0;
			nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
						lru_pages);
			nr_reclaimed += reclaim_state->reclaimed_slab;
			total_scanned += sc.nr_scanned;
			if (zone->all_unreclaimable)
				continue;
			if (nr_slab == 0 && zone->pages_scanned >=
				    (zone->nr_active + zone->nr_inactive) * 6)
				zone->all_unreclaimable = 1;
			/*
			 * If we've done a decent amount of scanning and
			 * the reclaim ratio is low, start doing writepage
			 * even in laptop mode
			 */
			if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
			    total_scanned > nr_reclaimed + nr_reclaimed / 2)
				sc.may_writepage = 1;
		}
		if (all_zones_ok)
			break;		/* kswapd: all done */
		/*
		 * OK, kswapd is getting into trouble.  Take a nap, then take
		 * another pass across the zones.
		 */
		if (total_scanned && priority < DEF_PRIORITY - 2)
			congestion_wait(WRITE, HZ/10);

		/*
		 * We do this so kswapd doesn't build up large priorities for
		 * example when it is freeing in parallel with allocators. It
		 * matches the direct reclaim path behaviour in terms of impact
		 * on zone->*_priority.
		 */
		if (nr_reclaimed >= SWAP_CLUSTER_MAX)
			break;
	}
out:
	/*
	 * Note within each zone the priority level at which this zone was
	 * brought into a happy state.  So that the next thread which scans this
	 * zone will start out at that priority level.
	 */
	for (i = 0; i < pgdat->nr_zones; i++) {
		struct zone *zone = pgdat->node_zones + i;

		zone->prev_priority = temp_priority[i];
	}
	if (!all_zones_ok) {
		cond_resched();

		try_to_freeze();

		goto loop_again;
	}

	return nr_reclaimed;
}

/*
 * The background pageout daemon, started as a kernel thread
 * from the init process.
 *
 * This basically trickles out pages so that we have _some_
 * free memory available even if there is no other activity
 * that frees anything up. This is needed for things like routing
 * etc, where we otherwise might have all activity going on in
 * asynchronous contexts that cannot page things out.
 *
 * If there are applications that are active memory-allocators
 * (most normal use), this basically shouldn't matter.
 */
static int kswapd(void *p)
{
	unsigned long order;
	pg_data_t *pgdat = (pg_data_t*)p;
	struct task_struct *tsk = current;
	DEFINE_WAIT(wait);
	struct reclaim_state reclaim_state = {
		.reclaimed_slab = 0,
	};
	cpumask_t cpumask;

	cpumask = node_to_cpumask(pgdat->node_id);
	if (!cpus_empty(cpumask))
		set_cpus_allowed(tsk, cpumask);
	current->reclaim_state = &reclaim_state;

	/*
	 * Tell the memory management that we're a "memory allocator",
	 * and that if we need more memory we should get access to it
	 * regardless (see "__alloc_pages()"). "kswapd" should
	 * never get caught in the normal page freeing logic.
	 *
	 * (Kswapd normally doesn't need memory anyway, but sometimes
	 * you need a small amount of memory in order to be able to
	 * page out something else, and this flag essentially protects
	 * us from recursively trying to free more memory as we're
	 * trying to free the first piece of memory in the first place).
	 */
	tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;

	order = 0;
	for ( ; ; ) {
		unsigned long new_order;

		try_to_freeze();

		prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
		new_order = pgdat->kswapd_max_order;
		pgdat->kswapd_max_order = 0;
		if (order < new_order) {
			/*
			 * Don't sleep if someone wants a larger 'order'
			 * allocation
			 */
			order = new_order;
		} else {
			schedule();
			order = pgdat->kswapd_max_order;
		}
		finish_wait(&pgdat->kswapd_wait, &wait);

		balance_pgdat(pgdat, order);
	}
	return 0;
}

/*
 * A zone is low on free memory, so wake its kswapd task to service it.
 */
void wakeup_kswapd(struct zone *zone, int order)
{
	pg_data_t *pgdat;

	if (!populated_zone(zone))
		return;

	pgdat = zone->zone_pgdat;
	if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0))
		return;
	if (pgdat->kswapd_max_order < order)
		pgdat->kswapd_max_order = order;
	if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
		return;
	if (!waitqueue_active(&pgdat->kswapd_wait))
		return;
	wake_up_interruptible(&pgdat->kswapd_wait);
}

#ifdef CONFIG_PM
/*
 * Helper function for shrink_all_memory().  Tries to reclaim 'nr_pages' pages
 * from LRU lists system-wide, for given pass and priority, and returns the
 * number of reclaimed pages
 *
 * For pass > 3 we also try to shrink the LRU lists that contain a few pages
 */
static unsigned long shrink_all_zones(unsigned long nr_pages, int prio,
				      int pass, struct scan_control *sc)
{
	struct zone *zone;
	unsigned long nr_to_scan, ret = 0;

	for_each_zone(zone) {

		if (!populated_zone(zone))
			continue;

		if (zone->all_unreclaimable && prio != DEF_PRIORITY)
			continue;

		/* For pass = 0 we don't shrink the active list */
		if (pass > 0) {
			zone->nr_scan_active += (zone->nr_active >> prio) + 1;
			if (zone->nr_scan_active >= nr_pages || pass > 3) {
				zone->nr_scan_active = 0;
				nr_to_scan = min(nr_pages, zone->nr_active);
				shrink_active_list(nr_to_scan, zone, sc, prio);
			}
		}

		zone->nr_scan_inactive += (zone->nr_inactive >> prio) + 1;
		if (zone->nr_scan_inactive >= nr_pages || pass > 3) {
			zone->nr_scan_inactive = 0;
			nr_to_scan = min(nr_pages, zone->nr_inactive);
			ret += shrink_inactive_list(nr_to_scan, zone, sc);
			if (ret >= nr_pages)
				return ret;
		}
	}

	return ret;
}

/*
 * Try to free `nr_pages' of memory, system-wide, and return the number of
 * freed pages.
 *
 * Rather than trying to age LRUs the aim is to preserve the overall
 * LRU order by reclaiming preferentially
 * inactive > active > active referenced > active mapped
 */
unsigned long shrink_all_memory(unsigned long nr_pages)
{
	unsigned long lru_pages, nr_slab;
	unsigned long ret = 0;
	int pass;
	struct reclaim_state reclaim_state;
	struct zone *zone;
	struct scan_control sc = {
		.gfp_mask = GFP_KERNEL,
		.may_swap = 0,
		.swap_cluster_max = nr_pages,
		.may_writepage = 1,
		.swappiness = vm_swappiness,
	};

	current->reclaim_state = &reclaim_state;

	lru_pages = 0;
	for_each_zone(zone)
		lru_pages += zone->nr_active + zone->nr_inactive;

	nr_slab = global_page_state(NR_SLAB_RECLAIMABLE);
	/* If slab caches are huge, it's better to hit them first */
	while (nr_slab >= lru_pages) {
		reclaim_state.reclaimed_slab = 0;
		shrink_slab(nr_pages, sc.gfp_mask, lru_pages);
		if (!reclaim_state.reclaimed_slab)
			break;

		ret += reclaim_state.reclaimed_slab;
		if (ret >= nr_pages)
			goto out;

		nr_slab -= reclaim_state.reclaimed_slab;
	}

	/*
	 * We try to shrink LRUs in 5 passes:
	 * 0 = Reclaim from inactive_list only
	 * 1 = Reclaim from active list but don't reclaim mapped
	 * 2 = 2nd pass of type 1
	 * 3 = Reclaim mapped (normal reclaim)
	 * 4 = 2nd pass of type 3
	 */
	for (pass = 0; pass < 5; pass++) {
		int prio;

		/* Needed for shrinking slab caches later on */
		if (!lru_pages)
			for_each_zone(zone) {
				lru_pages += zone->nr_active;
				lru_pages += zone->nr_inactive;
			}

		/* Force reclaiming mapped pages in the passes #3 and #4 */
		if (pass > 2) {
			sc.may_swap = 1;
			sc.swappiness = 100;
		}

		for (prio = DEF_PRIORITY; prio >= 0; prio--) {
			unsigned long nr_to_scan = nr_pages - ret;

			sc.nr_scanned = 0;
			ret += shrink_all_zones(nr_to_scan, prio, pass, &sc);
			if (ret >= nr_pages)
				goto out;

			reclaim_state.reclaimed_slab = 0;
			shrink_slab(sc.nr_scanned, sc.gfp_mask, lru_pages);
			ret += reclaim_state.reclaimed_slab;
			if (ret >= nr_pages)
				goto out;

			if (sc.nr_scanned && prio < DEF_PRIORITY - 2)
				congestion_wait(WRITE, HZ / 10);
		}

		lru_pages = 0;
	}

	/*
	 * If ret = 0, we could not shrink LRUs, but there may be something
	 * in slab caches
	 */
	if (!ret)
		do {
			reclaim_state.reclaimed_slab = 0;
			shrink_slab(nr_pages, sc.gfp_mask, lru_pages);
			ret += reclaim_state.reclaimed_slab;
		} while (ret < nr_pages && reclaim_state.reclaimed_slab > 0);

out:
	current->reclaim_state = NULL;

	return ret;
}
#endif

/* It's optimal to keep kswapds on the same CPUs as their memory, but
   not required for correctness.  So if the last cpu in a node goes
   away, we get changed to run anywhere: as the first one comes back,
   restore their cpu bindings. */
static int __devinit cpu_callback(struct notifier_block *nfb,
				  unsigned long action, void *hcpu)
{
	pg_data_t *pgdat;
	cpumask_t mask;

	if (action == CPU_ONLINE) {
		for_each_online_pgdat(pgdat) {
			mask = node_to_cpumask(pgdat->node_id);
			if (any_online_cpu(mask) != NR_CPUS)
				/* One of our CPUs online: restore mask */
				set_cpus_allowed(pgdat->kswapd, mask);
		}
	}
	return NOTIFY_OK;
}

/*
 * This kswapd start function will be called by init and node-hot-add.
 * On node-hot-add, kswapd will be moved to proper cpus if cpus are hot-added.
 */
int kswapd_run(int nid)
{
	pg_data_t *pgdat = NODE_DATA(nid);
	int ret = 0;

	if (pgdat->kswapd)
		return 0;

	pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
	if (IS_ERR(pgdat->kswapd)) {
		/* failure at boot is fatal */
		BUG_ON(system_state == SYSTEM_BOOTING);
		printk("Failed to start kswapd on node %d\n", nid);
		ret = -1;
	}
	return ret;
}

static int __init kswapd_init(void)
{
	int nid;

	swap_setup();
	for_each_online_node(nid)
		kswapd_run(nid);
	hotcpu_notifier(cpu_callback, 0);
	return 0;
}

module_init(kswapd_init)

#ifdef CONFIG_NUMA
/*
 * Zone reclaim mode
 *
 * If non-zero call zone_reclaim when the number of free pages falls below
 * the watermarks.
 */
int zone_reclaim_mode __read_mostly;

#define RECLAIM_OFF 0
#define RECLAIM_ZONE (1<<0)	/* Run shrink_cache on the zone */
#define RECLAIM_WRITE (1<<1)	/* Writeout pages during reclaim */
#define RECLAIM_SWAP (1<<2)	/* Swap pages out during reclaim */

/*
 * Priority for ZONE_RECLAIM. This determines the fraction of pages
 * of a node considered for each zone_reclaim. 4 scans 1/16th of
 * a zone.
 */
#define ZONE_RECLAIM_PRIORITY 4
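
/*
 * Added note (illustrative, not in the original source): zone_reclaim_mode
 * is a bitmask, so e.g. a value of 3 (RECLAIM_ZONE | RECLAIM_WRITE) enables
 * local page reclaim with writeback but no swapping.  With
 * ZONE_RECLAIM_PRIORITY at 4, each pass of __zone_reclaim() gives
 * shrink_zone() a scan target of roughly zone_size >> 4, i.e. about 1/16th
 * of the zone's LRU pages, and then moves to a more aggressive priority if
 * that was not enough.
 */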

/*
 * Percentage of pages in a zone that must be unmapped for zone_reclaim to
 * occur.
 */
int sysctl_min_unmapped_ratio = 1;

/*
 * If the number of slab pages in a zone grows beyond this percentage then
 * slab reclaim needs to occur.
 */
int sysctl_min_slab_ratio = 5;

/*
 * Try to free up some pages from this zone through reclaim.
 */
static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
{
	/* Minimum pages needed in order to stay on node */
	const unsigned long nr_pages = 1 << order;
	struct task_struct *p = current;
	struct reclaim_state reclaim_state;
	int priority;
	unsigned long nr_reclaimed = 0;
	struct scan_control sc = {
		.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
		.may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP),
		.swap_cluster_max = max_t(unsigned long, nr_pages,
					SWAP_CLUSTER_MAX),
		.gfp_mask = gfp_mask,
		.swappiness = vm_swappiness,
	};
	unsigned long slab_reclaimable;

	disable_swap_token();
	cond_resched();
	/*
	 * We need to be able to allocate from the reserves for RECLAIM_SWAP
	 * and we also need to be able to write out pages for RECLAIM_WRITE
	 * and RECLAIM_SWAP.
	 */
	p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
	reclaim_state.reclaimed_slab = 0;
	p->reclaim_state = &reclaim_state;

	if (zone_page_state(zone, NR_FILE_PAGES) -
		zone_page_state(zone, NR_FILE_MAPPED) >
		zone->min_unmapped_pages) {
		/*
		 * Free memory by calling shrink zone with increasing
		 * priorities until we have enough memory freed.
		 */
		priority = ZONE_RECLAIM_PRIORITY;
		do {
			note_zone_scanning_priority(zone, priority);
			nr_reclaimed += shrink_zone(priority, zone, &sc);
			priority--;
		} while (priority >= 0 && nr_reclaimed < nr_pages);
	}

	slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
	if (slab_reclaimable > zone->min_slab_pages) {
		/*
		 * shrink_slab() does not currently allow us to determine how
		 * many pages were freed in this zone. So we take the current
		 * number of slab pages and shake the slab until it is reduced
		 * by the same nr_pages that we used for reclaiming unmapped
		 * pages.
		 *
		 * Note that shrink_slab will free memory on all zones and may
		 * take a long time.
		 */
		while (shrink_slab(sc.nr_scanned, gfp_mask, order) &&
			zone_page_state(zone, NR_SLAB_RECLAIMABLE) >
				slab_reclaimable - nr_pages)
			;

		/*
		 * Update nr_reclaimed by the number of slab pages we
		 * reclaimed from this zone.
		 */
		nr_reclaimed += slab_reclaimable -
			zone_page_state(zone, NR_SLAB_RECLAIMABLE);
	}

	p->reclaim_state = NULL;
	current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
	return nr_reclaimed >= nr_pages;
}

int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
{
	cpumask_t mask;
	int node_id;

	/*
	 * Zone reclaim reclaims unmapped file backed pages and
	 * slab pages if we are over the defined limits.
	 *
	 * A small portion of unmapped file backed pages is needed for
	 * file I/O otherwise pages read by file I/O will be immediately
	 * thrown out if the zone is overallocated. So we do not reclaim
	 * if less than a specified percentage of the zone is used by
	 * unmapped file backed pages.
	 */
	if (zone_page_state(zone, NR_FILE_PAGES) -
	    zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_pages
	    && zone_page_state(zone, NR_SLAB_RECLAIMABLE)
			<= zone->min_slab_pages)
		return 0;

	/*
	 * Avoid concurrent zone reclaims, do not reclaim in a zone that does
	 * not have reclaimable pages and if we should not delay the allocation
	 * then do not scan.
	 */
	if (!(gfp_mask & __GFP_WAIT) ||
		zone->all_unreclaimable ||
		atomic_read(&zone->reclaim_in_progress) > 0 ||
		(current->flags & PF_MEMALLOC))
		return 0;

	/*
	 * Only run zone reclaim on the local zone or on zones that do not
	 * have associated processors. This will favor the local processor
	 * over remote processors and spread off node memory allocations
	 * as wide as possible.
	 */
	node_id = zone_to_nid(zone);
	mask = node_to_cpumask(node_id);
	if (!cpus_empty(mask) && node_id != numa_node_id())
		return 0;
	return __zone_reclaim(zone, gfp_mask, order);
}
#endif