vmscan.c revision 232ea4d69d81169453344b7d05203425c88d973b
1/* 2 * linux/mm/vmscan.c 3 * 4 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 5 * 6 * Swap reorganised 29.12.95, Stephen Tweedie. 7 * kswapd added: 7.1.96 sct 8 * Removed kswapd_ctl limits, and swap out as many pages as needed 9 * to bring the system back to freepages.high: 2.4.97, Rik van Riel. 10 * Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com). 11 * Multiqueue VM started 5.8.00, Rik van Riel. 12 */ 13 14#include <linux/mm.h> 15#include <linux/module.h> 16#include <linux/slab.h> 17#include <linux/kernel_stat.h> 18#include <linux/swap.h> 19#include <linux/pagemap.h> 20#include <linux/init.h> 21#include <linux/highmem.h> 22#include <linux/vmstat.h> 23#include <linux/file.h> 24#include <linux/writeback.h> 25#include <linux/blkdev.h> 26#include <linux/buffer_head.h> /* for try_to_release_page(), 27 buffer_heads_over_limit */ 28#include <linux/mm_inline.h> 29#include <linux/pagevec.h> 30#include <linux/backing-dev.h> 31#include <linux/rmap.h> 32#include <linux/topology.h> 33#include <linux/cpu.h> 34#include <linux/cpuset.h> 35#include <linux/notifier.h> 36#include <linux/rwsem.h> 37#include <linux/delay.h> 38#include <linux/kthread.h> 39#include <linux/freezer.h> 40 41#include <asm/tlbflush.h> 42#include <asm/div64.h> 43 44#include <linux/swapops.h> 45 46#include "internal.h" 47 48struct scan_control { 49 /* Incremented by the number of inactive pages that were scanned */ 50 unsigned long nr_scanned; 51 52 /* This context's GFP mask */ 53 gfp_t gfp_mask; 54 55 int may_writepage; 56 57 /* Can pages be swapped as part of reclaim? */ 58 int may_swap; 59 60 /* This context's SWAP_CLUSTER_MAX. If freeing memory for 61 * suspend, we effectively ignore SWAP_CLUSTER_MAX. 62 * In this context, it doesn't matter that we scan the 63 * whole list at once. */ 64 int swap_cluster_max; 65 66 int swappiness; 67 68 int all_unreclaimable; 69}; 70 71/* 72 * The list of shrinker callbacks used by to apply pressure to 73 * ageable caches. 74 */ 75struct shrinker { 76 shrinker_t shrinker; 77 struct list_head list; 78 int seeks; /* seeks to recreate an obj */ 79 long nr; /* objs pending delete */ 80}; 81 82#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) 83 84#ifdef ARCH_HAS_PREFETCH 85#define prefetch_prev_lru_page(_page, _base, _field) \ 86 do { \ 87 if ((_page)->lru.prev != _base) { \ 88 struct page *prev; \ 89 \ 90 prev = lru_to_page(&(_page->lru)); \ 91 prefetch(&prev->_field); \ 92 } \ 93 } while (0) 94#else 95#define prefetch_prev_lru_page(_page, _base, _field) do { } while (0) 96#endif 97 98#ifdef ARCH_HAS_PREFETCHW 99#define prefetchw_prev_lru_page(_page, _base, _field) \ 100 do { \ 101 if ((_page)->lru.prev != _base) { \ 102 struct page *prev; \ 103 \ 104 prev = lru_to_page(&(_page->lru)); \ 105 prefetchw(&prev->_field); \ 106 } \ 107 } while (0) 108#else 109#define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0) 110#endif 111 112/* 113 * From 0 .. 100. Higher means more swappy. 
114 */ 115int vm_swappiness = 60; 116long vm_total_pages; /* The total number of pages which the VM controls */ 117 118static LIST_HEAD(shrinker_list); 119static DECLARE_RWSEM(shrinker_rwsem); 120 121/* 122 * Add a shrinker callback to be called from the vm 123 */ 124struct shrinker *set_shrinker(int seeks, shrinker_t theshrinker) 125{ 126 struct shrinker *shrinker; 127 128 shrinker = kmalloc(sizeof(*shrinker), GFP_KERNEL); 129 if (shrinker) { 130 shrinker->shrinker = theshrinker; 131 shrinker->seeks = seeks; 132 shrinker->nr = 0; 133 down_write(&shrinker_rwsem); 134 list_add_tail(&shrinker->list, &shrinker_list); 135 up_write(&shrinker_rwsem); 136 } 137 return shrinker; 138} 139EXPORT_SYMBOL(set_shrinker); 140 141/* 142 * Remove one 143 */ 144void remove_shrinker(struct shrinker *shrinker) 145{ 146 down_write(&shrinker_rwsem); 147 list_del(&shrinker->list); 148 up_write(&shrinker_rwsem); 149 kfree(shrinker); 150} 151EXPORT_SYMBOL(remove_shrinker); 152 153#define SHRINK_BATCH 128 154/* 155 * Call the shrink functions to age shrinkable caches 156 * 157 * Here we assume it costs one seek to replace a lru page and that it also 158 * takes a seek to recreate a cache object. With this in mind we age equal 159 * percentages of the lru and ageable caches. This should balance the seeks 160 * generated by these structures. 161 * 162 * If the vm encounted mapped pages on the LRU it increase the pressure on 163 * slab to avoid swapping. 164 * 165 * We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits. 166 * 167 * `lru_pages' represents the number of on-LRU pages in all the zones which 168 * are eligible for the caller's allocation attempt. It is used for balancing 169 * slab reclaim versus page reclaim. 170 * 171 * Returns the number of slab objects which we shrunk. 172 */ 173unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, 174 unsigned long lru_pages) 175{ 176 struct shrinker *shrinker; 177 unsigned long ret = 0; 178 179 if (scanned == 0) 180 scanned = SWAP_CLUSTER_MAX; 181 182 if (!down_read_trylock(&shrinker_rwsem)) 183 return 1; /* Assume we'll be able to shrink next time */ 184 185 list_for_each_entry(shrinker, &shrinker_list, list) { 186 unsigned long long delta; 187 unsigned long total_scan; 188 unsigned long max_pass = (*shrinker->shrinker)(0, gfp_mask); 189 190 delta = (4 * scanned) / shrinker->seeks; 191 delta *= max_pass; 192 do_div(delta, lru_pages + 1); 193 shrinker->nr += delta; 194 if (shrinker->nr < 0) { 195 printk(KERN_ERR "%s: nr=%ld\n", 196 __FUNCTION__, shrinker->nr); 197 shrinker->nr = max_pass; 198 } 199 200 /* 201 * Avoid risking looping forever due to too large nr value: 202 * never try to free more than twice the estimate number of 203 * freeable entries. 
204 */ 205 if (shrinker->nr > max_pass * 2) 206 shrinker->nr = max_pass * 2; 207 208 total_scan = shrinker->nr; 209 shrinker->nr = 0; 210 211 while (total_scan >= SHRINK_BATCH) { 212 long this_scan = SHRINK_BATCH; 213 int shrink_ret; 214 int nr_before; 215 216 nr_before = (*shrinker->shrinker)(0, gfp_mask); 217 shrink_ret = (*shrinker->shrinker)(this_scan, gfp_mask); 218 if (shrink_ret == -1) 219 break; 220 if (shrink_ret < nr_before) 221 ret += nr_before - shrink_ret; 222 count_vm_events(SLABS_SCANNED, this_scan); 223 total_scan -= this_scan; 224 225 cond_resched(); 226 } 227 228 shrinker->nr += total_scan; 229 } 230 up_read(&shrinker_rwsem); 231 return ret; 232} 233 234/* Called without lock on whether page is mapped, so answer is unstable */ 235static inline int page_mapping_inuse(struct page *page) 236{ 237 struct address_space *mapping; 238 239 /* Page is in somebody's page tables. */ 240 if (page_mapped(page)) 241 return 1; 242 243 /* Be more reluctant to reclaim swapcache than pagecache */ 244 if (PageSwapCache(page)) 245 return 1; 246 247 mapping = page_mapping(page); 248 if (!mapping) 249 return 0; 250 251 /* File is mmap'd by somebody? */ 252 return mapping_mapped(mapping); 253} 254 255static inline int is_page_cache_freeable(struct page *page) 256{ 257 return page_count(page) - !!PagePrivate(page) == 2; 258} 259 260static int may_write_to_queue(struct backing_dev_info *bdi) 261{ 262 if (current->flags & PF_SWAPWRITE) 263 return 1; 264 if (!bdi_write_congested(bdi)) 265 return 1; 266 if (bdi == current->backing_dev_info) 267 return 1; 268 return 0; 269} 270 271/* 272 * We detected a synchronous write error writing a page out. Probably 273 * -ENOSPC. We need to propagate that into the address_space for a subsequent 274 * fsync(), msync() or close(). 275 * 276 * The tricky part is that after writepage we cannot touch the mapping: nothing 277 * prevents it from being freed up. But we have a ref on the page and once 278 * that page is locked, the mapping is pinned. 279 * 280 * We're allowed to run sleeping lock_page() here because we know the caller has 281 * __GFP_FS. 282 */ 283static void handle_write_error(struct address_space *mapping, 284 struct page *page, int error) 285{ 286 lock_page(page); 287 if (page_mapping(page) == mapping) { 288 if (error == -ENOSPC) 289 set_bit(AS_ENOSPC, &mapping->flags); 290 else 291 set_bit(AS_EIO, &mapping->flags); 292 } 293 unlock_page(page); 294} 295 296/* possible outcome of pageout() */ 297typedef enum { 298 /* failed to write page out, page is locked */ 299 PAGE_KEEP, 300 /* move page to the active list, page is locked */ 301 PAGE_ACTIVATE, 302 /* page has been sent to the disk successfully, page is unlocked */ 303 PAGE_SUCCESS, 304 /* page is clean and locked */ 305 PAGE_CLEAN, 306} pageout_t; 307 308/* 309 * pageout is called by shrink_page_list() for each dirty page. 310 * Calls ->writepage(). 311 */ 312static pageout_t pageout(struct page *page, struct address_space *mapping) 313{ 314 /* 315 * If the page is dirty, only perform writeback if that write 316 * will be non-blocking. To prevent this allocation from being 317 * stalled by pagecache activity. But note that there may be 318 * stalls if we need to run get_block(). We could test 319 * PagePrivate for that. 320 * 321 * If this process is currently in generic_file_write() against 322 * this page's queue, we can perform writeback even if that 323 * will block. 324 * 325 * If the page is swapcache, write it back even if that would 326 * block, for some throttling. 
This happens by accident, because 327 * swap_backing_dev_info is bust: it doesn't reflect the 328 * congestion state of the swapdevs. Easy to fix, if needed. 329 * See swapfile.c:page_queue_congested(). 330 */ 331 if (!is_page_cache_freeable(page)) 332 return PAGE_KEEP; 333 if (!mapping) { 334 /* 335 * Some data journaling orphaned pages can have 336 * page->mapping == NULL while being dirty with clean buffers. 337 */ 338 if (PagePrivate(page)) { 339 if (try_to_free_buffers(page)) { 340 ClearPageDirty(page); 341 printk("%s: orphaned page\n", __FUNCTION__); 342 return PAGE_CLEAN; 343 } 344 } 345 return PAGE_KEEP; 346 } 347 if (mapping->a_ops->writepage == NULL) 348 return PAGE_ACTIVATE; 349 if (!may_write_to_queue(mapping->backing_dev_info)) 350 return PAGE_KEEP; 351 352 if (clear_page_dirty_for_io(page)) { 353 int res; 354 struct writeback_control wbc = { 355 .sync_mode = WB_SYNC_NONE, 356 .nr_to_write = SWAP_CLUSTER_MAX, 357 .range_start = 0, 358 .range_end = LLONG_MAX, 359 .nonblocking = 1, 360 .for_reclaim = 1, 361 }; 362 363 SetPageReclaim(page); 364 res = mapping->a_ops->writepage(page, &wbc); 365 if (res < 0) 366 handle_write_error(mapping, page, res); 367 if (res == AOP_WRITEPAGE_ACTIVATE) { 368 ClearPageReclaim(page); 369 return PAGE_ACTIVATE; 370 } 371 if (!PageWriteback(page)) { 372 /* synchronous write or broken a_ops? */ 373 ClearPageReclaim(page); 374 } 375 inc_zone_page_state(page, NR_VMSCAN_WRITE); 376 return PAGE_SUCCESS; 377 } 378 379 return PAGE_CLEAN; 380} 381 382/* 383 * Attempt to detach a locked page from its ->mapping. If it is dirty or if 384 * someone else has a ref on the page, abort and return 0. If it was 385 * successfully detached, return 1. Assumes the caller has a single ref on 386 * this page. 387 */ 388int remove_mapping(struct address_space *mapping, struct page *page) 389{ 390 BUG_ON(!PageLocked(page)); 391 BUG_ON(mapping != page_mapping(page)); 392 393 write_lock_irq(&mapping->tree_lock); 394 /* 395 * The non racy check for a busy page. 396 * 397 * Must be careful with the order of the tests. When someone has 398 * a ref to the page, it may be possible that they dirty it then 399 * drop the reference. So if PageDirty is tested before page_count 400 * here, then the following race may occur: 401 * 402 * get_user_pages(&page); 403 * [user mapping goes away] 404 * write_to(page); 405 * !PageDirty(page) [good] 406 * SetPageDirty(page); 407 * put_page(page); 408 * !page_count(page) [good, discard it] 409 * 410 * [oops, our write_to data is lost] 411 * 412 * Reversing the order of the tests ensures such a situation cannot 413 * escape unnoticed. The smp_rmb is needed to ensure the page->flags 414 * load is not satisfied before that of page->_count. 415 * 416 * Note that if SetPageDirty is always performed via set_page_dirty, 417 * and thus under tree_lock, then this ordering is not required. 
418 */ 419 if (unlikely(page_count(page) != 2)) 420 goto cannot_free; 421 smp_rmb(); 422 if (unlikely(PageDirty(page))) 423 goto cannot_free; 424 425 if (PageSwapCache(page)) { 426 swp_entry_t swap = { .val = page_private(page) }; 427 __delete_from_swap_cache(page); 428 write_unlock_irq(&mapping->tree_lock); 429 swap_free(swap); 430 __put_page(page); /* The pagecache ref */ 431 return 1; 432 } 433 434 __remove_from_page_cache(page); 435 write_unlock_irq(&mapping->tree_lock); 436 __put_page(page); 437 return 1; 438 439cannot_free: 440 write_unlock_irq(&mapping->tree_lock); 441 return 0; 442} 443 444/* 445 * shrink_page_list() returns the number of reclaimed pages 446 */ 447static unsigned long shrink_page_list(struct list_head *page_list, 448 struct scan_control *sc) 449{ 450 LIST_HEAD(ret_pages); 451 struct pagevec freed_pvec; 452 int pgactivate = 0; 453 unsigned long nr_reclaimed = 0; 454 455 cond_resched(); 456 457 pagevec_init(&freed_pvec, 1); 458 while (!list_empty(page_list)) { 459 struct address_space *mapping; 460 struct page *page; 461 int may_enter_fs; 462 int referenced; 463 464 cond_resched(); 465 466 page = lru_to_page(page_list); 467 list_del(&page->lru); 468 469 if (TestSetPageLocked(page)) 470 goto keep; 471 472 VM_BUG_ON(PageActive(page)); 473 474 sc->nr_scanned++; 475 476 if (!sc->may_swap && page_mapped(page)) 477 goto keep_locked; 478 479 /* Double the slab pressure for mapped and swapcache pages */ 480 if (page_mapped(page) || PageSwapCache(page)) 481 sc->nr_scanned++; 482 483 if (PageWriteback(page)) 484 goto keep_locked; 485 486 referenced = page_referenced(page, 1); 487 /* In active use or really unfreeable? Activate it. */ 488 if (referenced && page_mapping_inuse(page)) 489 goto activate_locked; 490 491#ifdef CONFIG_SWAP 492 /* 493 * Anonymous process memory has backing store? 494 * Try to allocate it some swap space here. 495 */ 496 if (PageAnon(page) && !PageSwapCache(page)) 497 if (!add_to_swap(page, GFP_ATOMIC)) 498 goto activate_locked; 499#endif /* CONFIG_SWAP */ 500 501 mapping = page_mapping(page); 502 may_enter_fs = (sc->gfp_mask & __GFP_FS) || 503 (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); 504 505 /* 506 * The page is mapped into the page tables of one or more 507 * processes. Try to unmap it here. 508 */ 509 if (page_mapped(page) && mapping) { 510 switch (try_to_unmap(page, 0)) { 511 case SWAP_FAIL: 512 goto activate_locked; 513 case SWAP_AGAIN: 514 goto keep_locked; 515 case SWAP_SUCCESS: 516 ; /* try to free the page below */ 517 } 518 } 519 520 if (PageDirty(page)) { 521 if (referenced) 522 goto keep_locked; 523 if (!may_enter_fs) 524 goto keep_locked; 525 if (!sc->may_writepage) 526 goto keep_locked; 527 528 /* Page is dirty, try to write it out here */ 529 switch(pageout(page, mapping)) { 530 case PAGE_KEEP: 531 goto keep_locked; 532 case PAGE_ACTIVATE: 533 goto activate_locked; 534 case PAGE_SUCCESS: 535 if (PageWriteback(page) || PageDirty(page)) 536 goto keep; 537 /* 538 * A synchronous write - probably a ramdisk. Go 539 * ahead and try to reclaim the page. 540 */ 541 if (TestSetPageLocked(page)) 542 goto keep; 543 if (PageDirty(page) || PageWriteback(page)) 544 goto keep_locked; 545 mapping = page_mapping(page); 546 case PAGE_CLEAN: 547 ; /* try to free the page below */ 548 } 549 } 550 551 /* 552 * If the page has buffers, try to free the buffer mappings 553 * associated with this page. If we succeed we try to free 554 * the page as well. 555 * 556 * We do this even if the page is PageDirty(). 
557 * try_to_release_page() does not perform I/O, but it is 558 * possible for a page to have PageDirty set, but it is actually 559 * clean (all its buffers are clean). This happens if the 560 * buffers were written out directly, with submit_bh(). ext3 561 * will do this, as well as the blockdev mapping. 562 * try_to_release_page() will discover that cleanness and will 563 * drop the buffers and mark the page clean - it can be freed. 564 * 565 * Rarely, pages can have buffers and no ->mapping. These are 566 * the pages which were not successfully invalidated in 567 * truncate_complete_page(). We try to drop those buffers here 568 * and if that worked, and the page is no longer mapped into 569 * process address space (page_count == 1) it can be freed. 570 * Otherwise, leave the page on the LRU so it is swappable. 571 */ 572 if (PagePrivate(page)) { 573 if (!try_to_release_page(page, sc->gfp_mask)) 574 goto activate_locked; 575 if (!mapping && page_count(page) == 1) 576 goto free_it; 577 } 578 579 if (!mapping || !remove_mapping(mapping, page)) 580 goto keep_locked; 581 582free_it: 583 unlock_page(page); 584 nr_reclaimed++; 585 if (!pagevec_add(&freed_pvec, page)) 586 __pagevec_release_nonlru(&freed_pvec); 587 continue; 588 589activate_locked: 590 SetPageActive(page); 591 pgactivate++; 592keep_locked: 593 unlock_page(page); 594keep: 595 list_add(&page->lru, &ret_pages); 596 VM_BUG_ON(PageLRU(page)); 597 } 598 list_splice(&ret_pages, page_list); 599 if (pagevec_count(&freed_pvec)) 600 __pagevec_release_nonlru(&freed_pvec); 601 count_vm_events(PGACTIVATE, pgactivate); 602 return nr_reclaimed; 603} 604 605/* 606 * zone->lru_lock is heavily contended. Some of the functions that 607 * shrink the lists perform better by taking out a batch of pages 608 * and working on them outside the LRU lock. 609 * 610 * For pagecache intensive workloads, this function is the hottest 611 * spot in the kernel (apart from copy_*_user functions). 612 * 613 * Appropriate locks must be held before calling this function. 614 * 615 * @nr_to_scan: The number of pages to look through on the list. 616 * @src: The LRU list to pull pages off. 617 * @dst: The temp list to put pages on to. 618 * @scanned: The number of pages that were scanned. 619 * 620 * returns how many pages were moved onto *@dst. 621 */ 622static unsigned long isolate_lru_pages(unsigned long nr_to_scan, 623 struct list_head *src, struct list_head *dst, 624 unsigned long *scanned) 625{ 626 unsigned long nr_taken = 0; 627 struct page *page; 628 unsigned long scan; 629 630 for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) { 631 struct list_head *target; 632 page = lru_to_page(src); 633 prefetchw_prev_lru_page(page, src, flags); 634 635 VM_BUG_ON(!PageLRU(page)); 636 637 list_del(&page->lru); 638 target = src; 639 if (likely(get_page_unless_zero(page))) { 640 /* 641 * Be careful not to clear PageLRU until after we're 642 * sure the page is not being freed elsewhere -- the 643 * page release code relies on it. 644 */ 645 ClearPageLRU(page); 646 target = dst; 647 nr_taken++; 648 } /* else it is being freed elsewhere */ 649 650 list_add(&page->lru, target); 651 } 652 653 *scanned = scan; 654 return nr_taken; 655} 656 657/* 658 * shrink_inactive_list() is a helper for shrink_zone(). 
It returns the number 659 * of reclaimed pages 660 */ 661static unsigned long shrink_inactive_list(unsigned long max_scan, 662 struct zone *zone, struct scan_control *sc) 663{ 664 LIST_HEAD(page_list); 665 struct pagevec pvec; 666 unsigned long nr_scanned = 0; 667 unsigned long nr_reclaimed = 0; 668 669 pagevec_init(&pvec, 1); 670 671 lru_add_drain(); 672 spin_lock_irq(&zone->lru_lock); 673 do { 674 struct page *page; 675 unsigned long nr_taken; 676 unsigned long nr_scan; 677 unsigned long nr_freed; 678 679 nr_taken = isolate_lru_pages(sc->swap_cluster_max, 680 &zone->inactive_list, 681 &page_list, &nr_scan); 682 __mod_zone_page_state(zone, NR_INACTIVE, -nr_taken); 683 zone->pages_scanned += nr_scan; 684 spin_unlock_irq(&zone->lru_lock); 685 686 nr_scanned += nr_scan; 687 nr_freed = shrink_page_list(&page_list, sc); 688 nr_reclaimed += nr_freed; 689 local_irq_disable(); 690 if (current_is_kswapd()) { 691 __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan); 692 __count_vm_events(KSWAPD_STEAL, nr_freed); 693 } else 694 __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan); 695 __count_zone_vm_events(PGSTEAL, zone, nr_freed); 696 697 if (nr_taken == 0) 698 goto done; 699 700 spin_lock(&zone->lru_lock); 701 /* 702 * Put back any unfreeable pages. 703 */ 704 while (!list_empty(&page_list)) { 705 page = lru_to_page(&page_list); 706 VM_BUG_ON(PageLRU(page)); 707 SetPageLRU(page); 708 list_del(&page->lru); 709 if (PageActive(page)) 710 add_page_to_active_list(zone, page); 711 else 712 add_page_to_inactive_list(zone, page); 713 if (!pagevec_add(&pvec, page)) { 714 spin_unlock_irq(&zone->lru_lock); 715 __pagevec_release(&pvec); 716 spin_lock_irq(&zone->lru_lock); 717 } 718 } 719 } while (nr_scanned < max_scan); 720 spin_unlock(&zone->lru_lock); 721done: 722 local_irq_enable(); 723 pagevec_release(&pvec); 724 return nr_reclaimed; 725} 726 727/* 728 * We are about to scan this zone at a certain priority level. If that priority 729 * level is smaller (ie: more urgent) than the previous priority, then note 730 * that priority level within the zone. This is done so that when the next 731 * process comes in to scan this zone, it will immediately start out at this 732 * priority level rather than having to build up its own scanning priority. 733 * Here, this priority affects only the reclaim-mapped threshold. 734 */ 735static inline void note_zone_scanning_priority(struct zone *zone, int priority) 736{ 737 if (priority < zone->prev_priority) 738 zone->prev_priority = priority; 739} 740 741static inline int zone_is_near_oom(struct zone *zone) 742{ 743 return zone->pages_scanned >= (zone_page_state(zone, NR_ACTIVE) 744 + zone_page_state(zone, NR_INACTIVE))*3; 745} 746 747/* 748 * This moves pages from the active list to the inactive list. 749 * 750 * We move them the other way if the page is referenced by one or more 751 * processes, from rmap. 752 * 753 * If the pages are mostly unmapped, the processing is fast and it is 754 * appropriate to hold zone->lru_lock across the whole operation. But if 755 * the pages are mapped, the processing is slow (page_referenced()) so we 756 * should drop zone->lru_lock around each page. It's impossible to balance 757 * this, so instead we remove the pages from the LRU while processing them. 758 * It is safe to rely on PG_active against the non-LRU pages in here because 759 * nobody will play with that bit on a non-LRU page. 760 * 761 * The downside is that we have to touch page->_count against each page. 762 * But we had to alter page->flags anyway. 
763 */ 764static void shrink_active_list(unsigned long nr_pages, struct zone *zone, 765 struct scan_control *sc, int priority) 766{ 767 unsigned long pgmoved; 768 int pgdeactivate = 0; 769 unsigned long pgscanned; 770 LIST_HEAD(l_hold); /* The pages which were snipped off */ 771 LIST_HEAD(l_inactive); /* Pages to go onto the inactive_list */ 772 LIST_HEAD(l_active); /* Pages to go onto the active_list */ 773 struct page *page; 774 struct pagevec pvec; 775 int reclaim_mapped = 0; 776 777 if (sc->may_swap) { 778 long mapped_ratio; 779 long distress; 780 long swap_tendency; 781 782 if (zone_is_near_oom(zone)) 783 goto force_reclaim_mapped; 784 785 /* 786 * `distress' is a measure of how much trouble we're having 787 * reclaiming pages. 0 -> no problems. 100 -> great trouble. 788 */ 789 distress = 100 >> min(zone->prev_priority, priority); 790 791 /* 792 * The point of this algorithm is to decide when to start 793 * reclaiming mapped memory instead of just pagecache. Work out 794 * how much memory 795 * is mapped. 796 */ 797 mapped_ratio = ((global_page_state(NR_FILE_MAPPED) + 798 global_page_state(NR_ANON_PAGES)) * 100) / 799 vm_total_pages; 800 801 /* 802 * Now decide how much we really want to unmap some pages. The 803 * mapped ratio is downgraded - just because there's a lot of 804 * mapped memory doesn't necessarily mean that page reclaim 805 * isn't succeeding. 806 * 807 * The distress ratio is important - we don't want to start 808 * going oom. 809 * 810 * A 100% value of vm_swappiness overrides this algorithm 811 * altogether. 812 */ 813 swap_tendency = mapped_ratio / 2 + distress + sc->swappiness; 814 815 /* 816 * Now use this metric to decide whether to start moving mapped 817 * memory onto the inactive list. 818 */ 819 if (swap_tendency >= 100) 820force_reclaim_mapped: 821 reclaim_mapped = 1; 822 } 823 824 lru_add_drain(); 825 spin_lock_irq(&zone->lru_lock); 826 pgmoved = isolate_lru_pages(nr_pages, &zone->active_list, 827 &l_hold, &pgscanned); 828 zone->pages_scanned += pgscanned; 829 __mod_zone_page_state(zone, NR_ACTIVE, -pgmoved); 830 spin_unlock_irq(&zone->lru_lock); 831 832 while (!list_empty(&l_hold)) { 833 cond_resched(); 834 page = lru_to_page(&l_hold); 835 list_del(&page->lru); 836 if (page_mapped(page)) { 837 if (!reclaim_mapped || 838 (total_swap_pages == 0 && PageAnon(page)) || 839 page_referenced(page, 0)) { 840 list_add(&page->lru, &l_active); 841 continue; 842 } 843 } 844 list_add(&page->lru, &l_inactive); 845 } 846 847 pagevec_init(&pvec, 1); 848 pgmoved = 0; 849 spin_lock_irq(&zone->lru_lock); 850 while (!list_empty(&l_inactive)) { 851 page = lru_to_page(&l_inactive); 852 prefetchw_prev_lru_page(page, &l_inactive, flags); 853 VM_BUG_ON(PageLRU(page)); 854 SetPageLRU(page); 855 VM_BUG_ON(!PageActive(page)); 856 ClearPageActive(page); 857 858 list_move(&page->lru, &zone->inactive_list); 859 pgmoved++; 860 if (!pagevec_add(&pvec, page)) { 861 __mod_zone_page_state(zone, NR_INACTIVE, pgmoved); 862 spin_unlock_irq(&zone->lru_lock); 863 pgdeactivate += pgmoved; 864 pgmoved = 0; 865 if (buffer_heads_over_limit) 866 pagevec_strip(&pvec); 867 __pagevec_release(&pvec); 868 spin_lock_irq(&zone->lru_lock); 869 } 870 } 871 __mod_zone_page_state(zone, NR_INACTIVE, pgmoved); 872 pgdeactivate += pgmoved; 873 if (buffer_heads_over_limit) { 874 spin_unlock_irq(&zone->lru_lock); 875 pagevec_strip(&pvec); 876 spin_lock_irq(&zone->lru_lock); 877 } 878 879 pgmoved = 0; 880 while (!list_empty(&l_active)) { 881 page = lru_to_page(&l_active); 882 prefetchw_prev_lru_page(page, &l_active, 
flags); 883 VM_BUG_ON(PageLRU(page)); 884 SetPageLRU(page); 885 VM_BUG_ON(!PageActive(page)); 886 list_move(&page->lru, &zone->active_list); 887 pgmoved++; 888 if (!pagevec_add(&pvec, page)) { 889 __mod_zone_page_state(zone, NR_ACTIVE, pgmoved); 890 pgmoved = 0; 891 spin_unlock_irq(&zone->lru_lock); 892 __pagevec_release(&pvec); 893 spin_lock_irq(&zone->lru_lock); 894 } 895 } 896 __mod_zone_page_state(zone, NR_ACTIVE, pgmoved); 897 898 __count_zone_vm_events(PGREFILL, zone, pgscanned); 899 __count_vm_events(PGDEACTIVATE, pgdeactivate); 900 spin_unlock_irq(&zone->lru_lock); 901 902 pagevec_release(&pvec); 903} 904 905/* 906 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. 907 */ 908static unsigned long shrink_zone(int priority, struct zone *zone, 909 struct scan_control *sc) 910{ 911 unsigned long nr_active; 912 unsigned long nr_inactive; 913 unsigned long nr_to_scan; 914 unsigned long nr_reclaimed = 0; 915 916 atomic_inc(&zone->reclaim_in_progress); 917 918 /* 919 * Add one to `nr_to_scan' just to make sure that the kernel will 920 * slowly sift through the active list. 921 */ 922 zone->nr_scan_active += 923 (zone_page_state(zone, NR_ACTIVE) >> priority) + 1; 924 nr_active = zone->nr_scan_active; 925 if (nr_active >= sc->swap_cluster_max) 926 zone->nr_scan_active = 0; 927 else 928 nr_active = 0; 929 930 zone->nr_scan_inactive += 931 (zone_page_state(zone, NR_INACTIVE) >> priority) + 1; 932 nr_inactive = zone->nr_scan_inactive; 933 if (nr_inactive >= sc->swap_cluster_max) 934 zone->nr_scan_inactive = 0; 935 else 936 nr_inactive = 0; 937 938 while (nr_active || nr_inactive) { 939 if (nr_active) { 940 nr_to_scan = min(nr_active, 941 (unsigned long)sc->swap_cluster_max); 942 nr_active -= nr_to_scan; 943 shrink_active_list(nr_to_scan, zone, sc, priority); 944 } 945 946 if (nr_inactive) { 947 nr_to_scan = min(nr_inactive, 948 (unsigned long)sc->swap_cluster_max); 949 nr_inactive -= nr_to_scan; 950 nr_reclaimed += shrink_inactive_list(nr_to_scan, zone, 951 sc); 952 } 953 } 954 955 throttle_vm_writeout(sc->gfp_mask); 956 957 atomic_dec(&zone->reclaim_in_progress); 958 return nr_reclaimed; 959} 960 961/* 962 * This is the direct reclaim path, for page-allocating processes. We only 963 * try to reclaim pages from zones which will satisfy the caller's allocation 964 * request. 965 * 966 * We reclaim from a zone even if that zone is over pages_high. Because: 967 * a) The caller may be trying to free *extra* pages to satisfy a higher-order 968 * allocation or 969 * b) The zones may be over pages_high but they must go *over* pages_high to 970 * satisfy the `incremental min' zone defense algorithm. 971 * 972 * Returns the number of reclaimed pages. 973 * 974 * If a zone is deemed to be full of pinned pages then just give it a light 975 * scan then give up on it. 
976 */ 977static unsigned long shrink_zones(int priority, struct zone **zones, 978 struct scan_control *sc) 979{ 980 unsigned long nr_reclaimed = 0; 981 int i; 982 983 sc->all_unreclaimable = 1; 984 for (i = 0; zones[i] != NULL; i++) { 985 struct zone *zone = zones[i]; 986 987 if (!populated_zone(zone)) 988 continue; 989 990 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 991 continue; 992 993 note_zone_scanning_priority(zone, priority); 994 995 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 996 continue; /* Let kswapd poll it */ 997 998 sc->all_unreclaimable = 0; 999 1000 nr_reclaimed += shrink_zone(priority, zone, sc); 1001 } 1002 return nr_reclaimed; 1003} 1004 1005/* 1006 * This is the main entry point to direct page reclaim. 1007 * 1008 * If a full scan of the inactive list fails to free enough memory then we 1009 * are "out of memory" and something needs to be killed. 1010 * 1011 * If the caller is !__GFP_FS then the probability of a failure is reasonably 1012 * high - the zone may be full of dirty or under-writeback pages, which this 1013 * caller can't do much about. We kick pdflush and take explicit naps in the 1014 * hope that some of these pages can be written. But if the allocating task 1015 * holds filesystem locks which prevent writeout this might not work, and the 1016 * allocation attempt will fail. 1017 */ 1018unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask) 1019{ 1020 int priority; 1021 int ret = 0; 1022 unsigned long total_scanned = 0; 1023 unsigned long nr_reclaimed = 0; 1024 struct reclaim_state *reclaim_state = current->reclaim_state; 1025 unsigned long lru_pages = 0; 1026 int i; 1027 struct scan_control sc = { 1028 .gfp_mask = gfp_mask, 1029 .may_writepage = !laptop_mode, 1030 .swap_cluster_max = SWAP_CLUSTER_MAX, 1031 .may_swap = 1, 1032 .swappiness = vm_swappiness, 1033 }; 1034 1035 count_vm_event(ALLOCSTALL); 1036 1037 for (i = 0; zones[i] != NULL; i++) { 1038 struct zone *zone = zones[i]; 1039 1040 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 1041 continue; 1042 1043 lru_pages += zone_page_state(zone, NR_ACTIVE) 1044 + zone_page_state(zone, NR_INACTIVE); 1045 } 1046 1047 for (priority = DEF_PRIORITY; priority >= 0; priority--) { 1048 sc.nr_scanned = 0; 1049 if (!priority) 1050 disable_swap_token(); 1051 nr_reclaimed += shrink_zones(priority, zones, &sc); 1052 shrink_slab(sc.nr_scanned, gfp_mask, lru_pages); 1053 if (reclaim_state) { 1054 nr_reclaimed += reclaim_state->reclaimed_slab; 1055 reclaim_state->reclaimed_slab = 0; 1056 } 1057 total_scanned += sc.nr_scanned; 1058 if (nr_reclaimed >= sc.swap_cluster_max) { 1059 ret = 1; 1060 goto out; 1061 } 1062 1063 /* 1064 * Try to write back as many pages as we just scanned. This 1065 * tends to cause slow streaming writers to write data to the 1066 * disk smoothly, at the dirtying rate, which is nice. But 1067 * that's undesirable in laptop mode, where we *want* lumpy 1068 * writeout. So in laptop mode, write out the whole world. 1069 */ 1070 if (total_scanned > sc.swap_cluster_max + 1071 sc.swap_cluster_max / 2) { 1072 wakeup_pdflush(laptop_mode ? 0 : total_scanned); 1073 sc.may_writepage = 1; 1074 } 1075 1076 /* Take a nap, wait for some writeback to complete */ 1077 if (sc.nr_scanned && priority < DEF_PRIORITY - 2) 1078 congestion_wait(WRITE, HZ/10); 1079 } 1080 /* top priority shrink_caches still had more to do? 
don't OOM, then */ 1081 if (!sc.all_unreclaimable) 1082 ret = 1; 1083out: 1084 /* 1085 * Now that we've scanned all the zones at this priority level, note 1086 * that level within the zone so that the next thread which performs 1087 * scanning of this zone will immediately start out at this priority 1088 * level. This affects only the decision whether or not to bring 1089 * mapped pages onto the inactive list. 1090 */ 1091 if (priority < 0) 1092 priority = 0; 1093 for (i = 0; zones[i] != 0; i++) { 1094 struct zone *zone = zones[i]; 1095 1096 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 1097 continue; 1098 1099 zone->prev_priority = priority; 1100 } 1101 return ret; 1102} 1103 1104/* 1105 * For kswapd, balance_pgdat() will work across all this node's zones until 1106 * they are all at pages_high. 1107 * 1108 * Returns the number of pages which were actually freed. 1109 * 1110 * There is special handling here for zones which are full of pinned pages. 1111 * This can happen if the pages are all mlocked, or if they are all used by 1112 * device drivers (say, ZONE_DMA). Or if they are all in use by hugetlb. 1113 * What we do is to detect the case where all pages in the zone have been 1114 * scanned twice and there has been zero successful reclaim. Mark the zone as 1115 * dead and from now on, only perform a short scan. Basically we're polling 1116 * the zone for when the problem goes away. 1117 * 1118 * kswapd scans the zones in the highmem->normal->dma direction. It skips 1119 * zones which have free_pages > pages_high, but once a zone is found to have 1120 * free_pages <= pages_high, we scan that zone and the lower zones regardless 1121 * of the number of free pages in the lower zones. This interoperates with 1122 * the page allocator fallback scheme to ensure that aging of pages is balanced 1123 * across the zones. 1124 */ 1125static unsigned long balance_pgdat(pg_data_t *pgdat, int order) 1126{ 1127 int all_zones_ok; 1128 int priority; 1129 int i; 1130 unsigned long total_scanned; 1131 unsigned long nr_reclaimed; 1132 struct reclaim_state *reclaim_state = current->reclaim_state; 1133 struct scan_control sc = { 1134 .gfp_mask = GFP_KERNEL, 1135 .may_swap = 1, 1136 .swap_cluster_max = SWAP_CLUSTER_MAX, 1137 .swappiness = vm_swappiness, 1138 }; 1139 /* 1140 * temp_priority is used to remember the scanning priority at which 1141 * this zone was successfully refilled to free_pages == pages_high. 1142 */ 1143 int temp_priority[MAX_NR_ZONES]; 1144 1145loop_again: 1146 total_scanned = 0; 1147 nr_reclaimed = 0; 1148 sc.may_writepage = !laptop_mode; 1149 count_vm_event(PAGEOUTRUN); 1150 1151 for (i = 0; i < pgdat->nr_zones; i++) 1152 temp_priority[i] = DEF_PRIORITY; 1153 1154 for (priority = DEF_PRIORITY; priority >= 0; priority--) { 1155 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ 1156 unsigned long lru_pages = 0; 1157 1158 /* The swap token gets in the way of swapout... 
*/ 1159 if (!priority) 1160 disable_swap_token(); 1161 1162 all_zones_ok = 1; 1163 1164 /* 1165 * Scan in the highmem->dma direction for the highest 1166 * zone which needs scanning 1167 */ 1168 for (i = pgdat->nr_zones - 1; i >= 0; i--) { 1169 struct zone *zone = pgdat->node_zones + i; 1170 1171 if (!populated_zone(zone)) 1172 continue; 1173 1174 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 1175 continue; 1176 1177 if (!zone_watermark_ok(zone, order, zone->pages_high, 1178 0, 0)) { 1179 end_zone = i; 1180 break; 1181 } 1182 } 1183 if (i < 0) 1184 goto out; 1185 1186 for (i = 0; i <= end_zone; i++) { 1187 struct zone *zone = pgdat->node_zones + i; 1188 1189 lru_pages += zone_page_state(zone, NR_ACTIVE) 1190 + zone_page_state(zone, NR_INACTIVE); 1191 } 1192 1193 /* 1194 * Now scan the zone in the dma->highmem direction, stopping 1195 * at the last zone which needs scanning. 1196 * 1197 * We do this because the page allocator works in the opposite 1198 * direction. This prevents the page allocator from allocating 1199 * pages behind kswapd's direction of progress, which would 1200 * cause too much scanning of the lower zones. 1201 */ 1202 for (i = 0; i <= end_zone; i++) { 1203 struct zone *zone = pgdat->node_zones + i; 1204 int nr_slab; 1205 1206 if (!populated_zone(zone)) 1207 continue; 1208 1209 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 1210 continue; 1211 1212 if (!zone_watermark_ok(zone, order, zone->pages_high, 1213 end_zone, 0)) 1214 all_zones_ok = 0; 1215 temp_priority[i] = priority; 1216 sc.nr_scanned = 0; 1217 note_zone_scanning_priority(zone, priority); 1218 nr_reclaimed += shrink_zone(priority, zone, &sc); 1219 reclaim_state->reclaimed_slab = 0; 1220 nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, 1221 lru_pages); 1222 nr_reclaimed += reclaim_state->reclaimed_slab; 1223 total_scanned += sc.nr_scanned; 1224 if (zone->all_unreclaimable) 1225 continue; 1226 if (nr_slab == 0 && zone->pages_scanned >= 1227 (zone_page_state(zone, NR_ACTIVE) 1228 + zone_page_state(zone, NR_INACTIVE)) * 6) 1229 zone->all_unreclaimable = 1; 1230 /* 1231 * If we've done a decent amount of scanning and 1232 * the reclaim ratio is low, start doing writepage 1233 * even in laptop mode 1234 */ 1235 if (total_scanned > SWAP_CLUSTER_MAX * 2 && 1236 total_scanned > nr_reclaimed + nr_reclaimed / 2) 1237 sc.may_writepage = 1; 1238 } 1239 if (all_zones_ok) 1240 break; /* kswapd: all done */ 1241 /* 1242 * OK, kswapd is getting into trouble. Take a nap, then take 1243 * another pass across the zones. 1244 */ 1245 if (total_scanned && priority < DEF_PRIORITY - 2) 1246 congestion_wait(WRITE, HZ/10); 1247 1248 /* 1249 * We do this so kswapd doesn't build up large priorities for 1250 * example when it is freeing in parallel with allocators. It 1251 * matches the direct reclaim path behaviour in terms of impact 1252 * on zone->*_priority. 1253 */ 1254 if (nr_reclaimed >= SWAP_CLUSTER_MAX) 1255 break; 1256 } 1257out: 1258 /* 1259 * Note within each zone the priority level at which this zone was 1260 * brought into a happy state. So that the next thread which scans this 1261 * zone will start out at that priority level. 
1262 */ 1263 for (i = 0; i < pgdat->nr_zones; i++) { 1264 struct zone *zone = pgdat->node_zones + i; 1265 1266 zone->prev_priority = temp_priority[i]; 1267 } 1268 if (!all_zones_ok) { 1269 cond_resched(); 1270 1271 try_to_freeze(); 1272 1273 goto loop_again; 1274 } 1275 1276 return nr_reclaimed; 1277} 1278 1279/* 1280 * The background pageout daemon, started as a kernel thread 1281 * from the init process. 1282 * 1283 * This basically trickles out pages so that we have _some_ 1284 * free memory available even if there is no other activity 1285 * that frees anything up. This is needed for things like routing 1286 * etc, where we otherwise might have all activity going on in 1287 * asynchronous contexts that cannot page things out. 1288 * 1289 * If there are applications that are active memory-allocators 1290 * (most normal use), this basically shouldn't matter. 1291 */ 1292static int kswapd(void *p) 1293{ 1294 unsigned long order; 1295 pg_data_t *pgdat = (pg_data_t*)p; 1296 struct task_struct *tsk = current; 1297 DEFINE_WAIT(wait); 1298 struct reclaim_state reclaim_state = { 1299 .reclaimed_slab = 0, 1300 }; 1301 cpumask_t cpumask; 1302 1303 cpumask = node_to_cpumask(pgdat->node_id); 1304 if (!cpus_empty(cpumask)) 1305 set_cpus_allowed(tsk, cpumask); 1306 current->reclaim_state = &reclaim_state; 1307 1308 /* 1309 * Tell the memory management that we're a "memory allocator", 1310 * and that if we need more memory we should get access to it 1311 * regardless (see "__alloc_pages()"). "kswapd" should 1312 * never get caught in the normal page freeing logic. 1313 * 1314 * (Kswapd normally doesn't need memory anyway, but sometimes 1315 * you need a small amount of memory in order to be able to 1316 * page out something else, and this flag essentially protects 1317 * us from recursively trying to free more memory as we're 1318 * trying to free the first piece of memory in the first place). 1319 */ 1320 tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; 1321 1322 order = 0; 1323 for ( ; ; ) { 1324 unsigned long new_order; 1325 1326 try_to_freeze(); 1327 1328 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); 1329 new_order = pgdat->kswapd_max_order; 1330 pgdat->kswapd_max_order = 0; 1331 if (order < new_order) { 1332 /* 1333 * Don't sleep if someone wants a larger 'order' 1334 * allocation 1335 */ 1336 order = new_order; 1337 } else { 1338 schedule(); 1339 order = pgdat->kswapd_max_order; 1340 } 1341 finish_wait(&pgdat->kswapd_wait, &wait); 1342 1343 balance_pgdat(pgdat, order); 1344 } 1345 return 0; 1346} 1347 1348/* 1349 * A zone is low on free memory, so wake its kswapd task to service it. 1350 */ 1351void wakeup_kswapd(struct zone *zone, int order) 1352{ 1353 pg_data_t *pgdat; 1354 1355 if (!populated_zone(zone)) 1356 return; 1357 1358 pgdat = zone->zone_pgdat; 1359 if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0)) 1360 return; 1361 if (pgdat->kswapd_max_order < order) 1362 pgdat->kswapd_max_order = order; 1363 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 1364 return; 1365 if (!waitqueue_active(&pgdat->kswapd_wait)) 1366 return; 1367 wake_up_interruptible(&pgdat->kswapd_wait); 1368} 1369 1370#ifdef CONFIG_PM 1371/* 1372 * Helper function for shrink_all_memory(). 
Tries to reclaim 'nr_pages' pages 1373 * from LRU lists system-wide, for given pass and priority, and returns the 1374 * number of reclaimed pages 1375 * 1376 * For pass > 3 we also try to shrink the LRU lists that contain a few pages 1377 */ 1378static unsigned long shrink_all_zones(unsigned long nr_pages, int prio, 1379 int pass, struct scan_control *sc) 1380{ 1381 struct zone *zone; 1382 unsigned long nr_to_scan, ret = 0; 1383 1384 for_each_zone(zone) { 1385 1386 if (!populated_zone(zone)) 1387 continue; 1388 1389 if (zone->all_unreclaimable && prio != DEF_PRIORITY) 1390 continue; 1391 1392 /* For pass = 0 we don't shrink the active list */ 1393 if (pass > 0) { 1394 zone->nr_scan_active += 1395 (zone_page_state(zone, NR_ACTIVE) >> prio) + 1; 1396 if (zone->nr_scan_active >= nr_pages || pass > 3) { 1397 zone->nr_scan_active = 0; 1398 nr_to_scan = min(nr_pages, 1399 zone_page_state(zone, NR_ACTIVE)); 1400 shrink_active_list(nr_to_scan, zone, sc, prio); 1401 } 1402 } 1403 1404 zone->nr_scan_inactive += 1405 (zone_page_state(zone, NR_INACTIVE) >> prio) + 1; 1406 if (zone->nr_scan_inactive >= nr_pages || pass > 3) { 1407 zone->nr_scan_inactive = 0; 1408 nr_to_scan = min(nr_pages, 1409 zone_page_state(zone, NR_INACTIVE)); 1410 ret += shrink_inactive_list(nr_to_scan, zone, sc); 1411 if (ret >= nr_pages) 1412 return ret; 1413 } 1414 } 1415 1416 return ret; 1417} 1418 1419static unsigned long count_lru_pages(void) 1420{ 1421 return global_page_state(NR_ACTIVE) + global_page_state(NR_INACTIVE); 1422} 1423 1424/* 1425 * Try to free `nr_pages' of memory, system-wide, and return the number of 1426 * freed pages. 1427 * 1428 * Rather than trying to age LRUs the aim is to preserve the overall 1429 * LRU order by reclaiming preferentially 1430 * inactive > active > active referenced > active mapped 1431 */ 1432unsigned long shrink_all_memory(unsigned long nr_pages) 1433{ 1434 unsigned long lru_pages, nr_slab; 1435 unsigned long ret = 0; 1436 int pass; 1437 struct reclaim_state reclaim_state; 1438 struct scan_control sc = { 1439 .gfp_mask = GFP_KERNEL, 1440 .may_swap = 0, 1441 .swap_cluster_max = nr_pages, 1442 .may_writepage = 1, 1443 .swappiness = vm_swappiness, 1444 }; 1445 1446 current->reclaim_state = &reclaim_state; 1447 1448 lru_pages = count_lru_pages(); 1449 nr_slab = global_page_state(NR_SLAB_RECLAIMABLE); 1450 /* If slab caches are huge, it's better to hit them first */ 1451 while (nr_slab >= lru_pages) { 1452 reclaim_state.reclaimed_slab = 0; 1453 shrink_slab(nr_pages, sc.gfp_mask, lru_pages); 1454 if (!reclaim_state.reclaimed_slab) 1455 break; 1456 1457 ret += reclaim_state.reclaimed_slab; 1458 if (ret >= nr_pages) 1459 goto out; 1460 1461 nr_slab -= reclaim_state.reclaimed_slab; 1462 } 1463 1464 /* 1465 * We try to shrink LRUs in 5 passes: 1466 * 0 = Reclaim from inactive_list only 1467 * 1 = Reclaim from active list but don't reclaim mapped 1468 * 2 = 2nd pass of type 1 1469 * 3 = Reclaim mapped (normal reclaim) 1470 * 4 = 2nd pass of type 3 1471 */ 1472 for (pass = 0; pass < 5; pass++) { 1473 int prio; 1474 1475 /* Force reclaiming mapped pages in the passes #3 and #4 */ 1476 if (pass > 2) { 1477 sc.may_swap = 1; 1478 sc.swappiness = 100; 1479 } 1480 1481 for (prio = DEF_PRIORITY; prio >= 0; prio--) { 1482 unsigned long nr_to_scan = nr_pages - ret; 1483 1484 sc.nr_scanned = 0; 1485 ret += shrink_all_zones(nr_to_scan, prio, pass, &sc); 1486 if (ret >= nr_pages) 1487 goto out; 1488 1489 reclaim_state.reclaimed_slab = 0; 1490 shrink_slab(sc.nr_scanned, sc.gfp_mask, 1491 count_lru_pages()); 
1492 ret += reclaim_state.reclaimed_slab; 1493 if (ret >= nr_pages) 1494 goto out; 1495 1496 if (sc.nr_scanned && prio < DEF_PRIORITY - 2) 1497 congestion_wait(WRITE, HZ / 10); 1498 } 1499 } 1500 1501 /* 1502 * If ret = 0, we could not shrink LRUs, but there may be something 1503 * in slab caches 1504 */ 1505 if (!ret) { 1506 do { 1507 reclaim_state.reclaimed_slab = 0; 1508 shrink_slab(nr_pages, sc.gfp_mask, count_lru_pages()); 1509 ret += reclaim_state.reclaimed_slab; 1510 } while (ret < nr_pages && reclaim_state.reclaimed_slab > 0); 1511 } 1512 1513out: 1514 current->reclaim_state = NULL; 1515 1516 return ret; 1517} 1518#endif 1519 1520/* It's optimal to keep kswapds on the same CPUs as their memory, but 1521 not required for correctness. So if the last cpu in a node goes 1522 away, we get changed to run anywhere: as the first one comes back, 1523 restore their cpu bindings. */ 1524static int __devinit cpu_callback(struct notifier_block *nfb, 1525 unsigned long action, void *hcpu) 1526{ 1527 pg_data_t *pgdat; 1528 cpumask_t mask; 1529 1530 if (action == CPU_ONLINE) { 1531 for_each_online_pgdat(pgdat) { 1532 mask = node_to_cpumask(pgdat->node_id); 1533 if (any_online_cpu(mask) != NR_CPUS) 1534 /* One of our CPUs online: restore mask */ 1535 set_cpus_allowed(pgdat->kswapd, mask); 1536 } 1537 } 1538 return NOTIFY_OK; 1539} 1540 1541/* 1542 * This kswapd start function will be called by init and node-hot-add. 1543 * On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added. 1544 */ 1545int kswapd_run(int nid) 1546{ 1547 pg_data_t *pgdat = NODE_DATA(nid); 1548 int ret = 0; 1549 1550 if (pgdat->kswapd) 1551 return 0; 1552 1553 pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid); 1554 if (IS_ERR(pgdat->kswapd)) { 1555 /* failure at boot is fatal */ 1556 BUG_ON(system_state == SYSTEM_BOOTING); 1557 printk("Failed to start kswapd on node %d\n",nid); 1558 ret = -1; 1559 } 1560 return ret; 1561} 1562 1563static int __init kswapd_init(void) 1564{ 1565 int nid; 1566 1567 swap_setup(); 1568 for_each_online_node(nid) 1569 kswapd_run(nid); 1570 hotcpu_notifier(cpu_callback, 0); 1571 return 0; 1572} 1573 1574module_init(kswapd_init) 1575 1576#ifdef CONFIG_NUMA 1577/* 1578 * Zone reclaim mode 1579 * 1580 * If non-zero call zone_reclaim when the number of free pages falls below 1581 * the watermarks. 1582 */ 1583int zone_reclaim_mode __read_mostly; 1584 1585#define RECLAIM_OFF 0 1586#define RECLAIM_ZONE (1<<0) /* Run shrink_cache on the zone */ 1587#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ 1588#define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */ 1589 1590/* 1591 * Priority for ZONE_RECLAIM. This determines the fraction of pages 1592 * of a node considered for each zone_reclaim. 4 scans 1/16th of 1593 * a zone. 1594 */ 1595#define ZONE_RECLAIM_PRIORITY 4 1596 1597/* 1598 * Percentage of pages in a zone that must be unmapped for zone_reclaim to 1599 * occur. 1600 */ 1601int sysctl_min_unmapped_ratio = 1; 1602 1603/* 1604 * If the number of slab pages in a zone grows beyond this percentage then 1605 * slab reclaim needs to occur. 1606 */ 1607int sysctl_min_slab_ratio = 5; 1608 1609/* 1610 * Try to free up some pages from this zone through reclaim. 
1611 */ 1612static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) 1613{ 1614 /* Minimum pages needed in order to stay on node */ 1615 const unsigned long nr_pages = 1 << order; 1616 struct task_struct *p = current; 1617 struct reclaim_state reclaim_state; 1618 int priority; 1619 unsigned long nr_reclaimed = 0; 1620 struct scan_control sc = { 1621 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), 1622 .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP), 1623 .swap_cluster_max = max_t(unsigned long, nr_pages, 1624 SWAP_CLUSTER_MAX), 1625 .gfp_mask = gfp_mask, 1626 .swappiness = vm_swappiness, 1627 }; 1628 unsigned long slab_reclaimable; 1629 1630 disable_swap_token(); 1631 cond_resched(); 1632 /* 1633 * We need to be able to allocate from the reserves for RECLAIM_SWAP 1634 * and we also need to be able to write out pages for RECLAIM_WRITE 1635 * and RECLAIM_SWAP. 1636 */ 1637 p->flags |= PF_MEMALLOC | PF_SWAPWRITE; 1638 reclaim_state.reclaimed_slab = 0; 1639 p->reclaim_state = &reclaim_state; 1640 1641 if (zone_page_state(zone, NR_FILE_PAGES) - 1642 zone_page_state(zone, NR_FILE_MAPPED) > 1643 zone->min_unmapped_pages) { 1644 /* 1645 * Free memory by calling shrink zone with increasing 1646 * priorities until we have enough memory freed. 1647 */ 1648 priority = ZONE_RECLAIM_PRIORITY; 1649 do { 1650 note_zone_scanning_priority(zone, priority); 1651 nr_reclaimed += shrink_zone(priority, zone, &sc); 1652 priority--; 1653 } while (priority >= 0 && nr_reclaimed < nr_pages); 1654 } 1655 1656 slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE); 1657 if (slab_reclaimable > zone->min_slab_pages) { 1658 /* 1659 * shrink_slab() does not currently allow us to determine how 1660 * many pages were freed in this zone. So we take the current 1661 * number of slab pages and shake the slab until it is reduced 1662 * by the same nr_pages that we used for reclaiming unmapped 1663 * pages. 1664 * 1665 * Note that shrink_slab will free memory on all zones and may 1666 * take a long time. 1667 */ 1668 while (shrink_slab(sc.nr_scanned, gfp_mask, order) && 1669 zone_page_state(zone, NR_SLAB_RECLAIMABLE) > 1670 slab_reclaimable - nr_pages) 1671 ; 1672 1673 /* 1674 * Update nr_reclaimed by the number of slab pages we 1675 * reclaimed from this zone. 1676 */ 1677 nr_reclaimed += slab_reclaimable - 1678 zone_page_state(zone, NR_SLAB_RECLAIMABLE); 1679 } 1680 1681 p->reclaim_state = NULL; 1682 current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); 1683 return nr_reclaimed >= nr_pages; 1684} 1685 1686int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) 1687{ 1688 cpumask_t mask; 1689 int node_id; 1690 1691 /* 1692 * Zone reclaim reclaims unmapped file backed pages and 1693 * slab pages if we are over the defined limits. 1694 * 1695 * A small portion of unmapped file backed pages is needed for 1696 * file I/O otherwise pages read by file I/O will be immediately 1697 * thrown out if the zone is overallocated. So we do not reclaim 1698 * if less than a specified percentage of the zone is used by 1699 * unmapped file backed pages. 1700 */ 1701 if (zone_page_state(zone, NR_FILE_PAGES) - 1702 zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_pages 1703 && zone_page_state(zone, NR_SLAB_RECLAIMABLE) 1704 <= zone->min_slab_pages) 1705 return 0; 1706 1707 /* 1708 * Avoid concurrent zone reclaims, do not reclaim in a zone that does 1709 * not have reclaimable pages and if we should not delay the allocation 1710 * then do not scan. 
	 */
	if (!(gfp_mask & __GFP_WAIT) ||
			zone->all_unreclaimable ||
			atomic_read(&zone->reclaim_in_progress) > 0 ||
			(current->flags & PF_MEMALLOC))
		return 0;

	/*
	 * Only run zone reclaim on the local zone or on zones that do not
	 * have associated processors. This will favor the local processor
	 * over remote processors and spread off node memory allocations
	 * as wide as possible.
	 */
	node_id = zone_to_nid(zone);
	mask = node_to_cpumask(node_id);
	if (!cpus_empty(mask) && node_id != numa_node_id())
		return 0;
	return __zone_reclaim(zone, gfp_mask, order);
}
#endif
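A minimal sketch of the shrinker interface this file exports -- set_shrinker(), remove_shrinker(), and the callback protocol that shrink_slab() drives. It is illustrative only and not part of the revision listed above: the my_cache_* names are invented, and the shrinker_t signature and the DEFAULT_SEEKS constant are assumed from the contemporaneous <linux/mm.h>.

/*
 * Illustrative sketch only -- not part of vmscan.c. The contract visible in
 * shrink_slab() above: the callback is invoked with nr_to_scan == 0 to report
 * how many objects it could free; with a nonzero count it frees up to that
 * many and returns the remaining count, or -1 if it cannot make progress
 * under this gfp_mask.
 */
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/list.h>
#include <linux/spinlock.h>

static LIST_HEAD(my_cache_lru);			/* hypothetical cached objects */
static DEFINE_SPINLOCK(my_cache_lock);
static int my_cache_nr_objects;			/* how many are on the list */
static struct shrinker *my_cache_shrinker;

static int my_cache_shrink(int nr_to_scan, gfp_t gfp_mask)
{
	if (nr_to_scan == 0)
		return my_cache_nr_objects;	/* query pass from shrink_slab() */

	if (!(gfp_mask & __GFP_FS))
		return -1;			/* cannot take fs locks in this context */

	spin_lock(&my_cache_lock);
	while (nr_to_scan-- && !list_empty(&my_cache_lru)) {
		struct list_head *entry = my_cache_lru.prev;

		list_del_init(entry);		/* ...then free the containing object... */
		my_cache_nr_objects--;
	}
	spin_unlock(&my_cache_lock);
	return my_cache_nr_objects;
}

static int __init my_cache_init(void)
{
	my_cache_shrinker = set_shrinker(DEFAULT_SEEKS, my_cache_shrink);
	return my_cache_shrinker ? 0 : -ENOMEM;
}

static void __exit my_cache_exit(void)
{
	remove_shrinker(my_cache_shrinker);
}

module_init(my_cache_init);
module_exit(my_cache_exit);
MODULE_LICENSE("GPL");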