swap.c revision f8891e5e1f93a128c3900f82035e8541357896a7
/*
 * linux/mm/swap.c
 *
 * Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */

/*
 * This file contains the default values for the operation of the
 * Linux VM subsystem. Fine-tuning documentation can be found in
 * Documentation/sysctl/vm.txt.
 * Started 18.12.91
 * Swap aging added 23.2.95, Stephen Tweedie.
 * Buffermem limits added 12.3.98, Rik van Riel.
 */

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm_inline.h>
#include <linux/buffer_head.h>	/* for try_to_release_page() */
#include <linux/percpu_counter.h>
#include <linux/percpu.h>
#include <linux/cpu.h>
#include <linux/notifier.h>

/* How many pages do we try to swap or page in/out together? */
int page_cluster;

static void put_compound_page(struct page *page)
{
	page = (struct page *)page_private(page);
	if (put_page_testzero(page)) {
		void (*dtor)(struct page *page);

		dtor = (void (*)(struct page *))page[1].lru.next;
		(*dtor)(page);
	}
}

void put_page(struct page *page)
{
	if (unlikely(PageCompound(page)))
		put_compound_page(page);
	else if (put_page_testzero(page))
		__page_cache_release(page);
}
EXPORT_SYMBOL(put_page);

/*
 * Writeback is about to end against a page which has been marked for immediate
 * reclaim.  If it still appears to be reclaimable, move it to the tail of the
 * inactive list.  The page still has PageWriteback set, which will pin it.
 *
 * We don't expect many pages to come through here, so don't bother batching
 * things up.
 *
 * To avoid placing the page at the tail of the LRU while PG_writeback is still
 * set, this function will clear PG_writeback before performing the page
 * motion.  Do that inside the lru lock, because once PG_writeback is cleared
 * we may not touch the page.
 *
 * Returns zero if it cleared PG_writeback.
 */
int rotate_reclaimable_page(struct page *page)
{
	struct zone *zone;
	unsigned long flags;

	if (PageLocked(page))
		return 1;
	if (PageDirty(page))
		return 1;
	if (PageActive(page))
		return 1;
	if (!PageLRU(page))
		return 1;

	zone = page_zone(page);
	spin_lock_irqsave(&zone->lru_lock, flags);
	if (PageLRU(page) && !PageActive(page)) {
		list_move_tail(&page->lru, &zone->inactive_list);
		__count_vm_event(PGROTATED);
	}
	if (!test_clear_page_writeback(page))
		BUG();
	spin_unlock_irqrestore(&zone->lru_lock, flags);
	return 0;
}

/*
 * FIXME: speed this up?
 */
void fastcall activate_page(struct page *page)
{
	struct zone *zone = page_zone(page);

	spin_lock_irq(&zone->lru_lock);
	if (PageLRU(page) && !PageActive(page)) {
		del_page_from_inactive_list(zone, page);
		SetPageActive(page);
		add_page_to_active_list(zone, page);
		__count_vm_event(PGACTIVATE);
	}
	spin_unlock_irq(&zone->lru_lock);
}
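
/*
 * Illustrative sketch (hypothetical caller, disabled): code that temporarily
 * pins a page pairs get_page() with put_page().  For an ordinary
 * (non-compound) page, the final put_page() above ends up in
 * __page_cache_release() below, which takes the page off the LRU and frees it.
 */
#if 0
static void example_use_page(struct page *page)
{
	get_page(page);		/* pin the page while we touch it */
	/* ... operate on the page ... */
	put_page(page);		/* may free the page if this was the last reference */
}
#endif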

/*
 * Mark a page as having seen activity.
 *
 * inactive,unreferenced	->	inactive,referenced
 * inactive,referenced		->	active,unreferenced
 * active,unreferenced		->	active,referenced
 */
void fastcall mark_page_accessed(struct page *page)
{
	if (!PageActive(page) && PageReferenced(page) && PageLRU(page)) {
		activate_page(page);
		ClearPageReferenced(page);
	} else if (!PageReferenced(page)) {
		SetPageReferenced(page);
	}
}

EXPORT_SYMBOL(mark_page_accessed);

/**
 * lru_cache_add: add a page to the page lists
 * @page: the page to add
 */
static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, };
static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, };

void fastcall lru_cache_add(struct page *page)
{
	struct pagevec *pvec = &get_cpu_var(lru_add_pvecs);

	page_cache_get(page);
	if (!pagevec_add(pvec, page))
		__pagevec_lru_add(pvec);
	put_cpu_var(lru_add_pvecs);
}

void fastcall lru_cache_add_active(struct page *page)
{
	struct pagevec *pvec = &get_cpu_var(lru_add_active_pvecs);

	page_cache_get(page);
	if (!pagevec_add(pvec, page))
		__pagevec_lru_add_active(pvec);
	put_cpu_var(lru_add_active_pvecs);
}

static void __lru_add_drain(int cpu)
{
	struct pagevec *pvec = &per_cpu(lru_add_pvecs, cpu);

	/* CPU is dead, so no locking needed. */
	if (pagevec_count(pvec))
		__pagevec_lru_add(pvec);
	pvec = &per_cpu(lru_add_active_pvecs, cpu);
	if (pagevec_count(pvec))
		__pagevec_lru_add_active(pvec);
}

void lru_add_drain(void)
{
	__lru_add_drain(get_cpu());
	put_cpu();
}

#ifdef CONFIG_NUMA
static void lru_add_drain_per_cpu(void *dummy)
{
	lru_add_drain();
}

/*
 * Returns 0 for success
 */
int lru_add_drain_all(void)
{
	return schedule_on_each_cpu(lru_add_drain_per_cpu, NULL);
}

#else

/*
 * Returns 0 for success
 */
int lru_add_drain_all(void)
{
	lru_add_drain();
	return 0;
}
#endif
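
/*
 * Illustrative sketch (hypothetical caller, disabled): lru_cache_add() takes
 * its own page reference (page_cache_get()) and queues the page on this CPU's
 * lru_add pagevec; the page only reaches the zone's inactive list once the
 * pagevec fills up or lru_add_drain() runs on that CPU.  Callers that need
 * the page to really be on the LRU right away therefore drain explicitly.
 */
#if 0
static void example_add_to_lru(struct page *page)
{
	lru_cache_add(page);	/* batched via this CPU's pagevec */
	lru_add_drain();	/* force the batched pages onto the LRU now */
}
#endif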

/*
 * This path almost never happens for VM activity - pages are normally
 * freed via pagevecs.  But it gets used by networking.
 */
void fastcall __page_cache_release(struct page *page)
{
	if (PageLRU(page)) {
		unsigned long flags;
		struct zone *zone = page_zone(page);

		spin_lock_irqsave(&zone->lru_lock, flags);
		BUG_ON(!PageLRU(page));
		__ClearPageLRU(page);
		del_page_from_lru(zone, page);
		spin_unlock_irqrestore(&zone->lru_lock, flags);
	}
	free_hot_page(page);
}
EXPORT_SYMBOL(__page_cache_release);

/*
 * Batched page_cache_release().  Decrement the reference count on all the
 * passed pages.  If it fell to zero then remove the page from the LRU and
 * free it.
 *
 * Avoid taking zone->lru_lock if possible, but if it is taken, retain it
 * for the remainder of the operation.
 *
 * The locking in this function is against shrink_cache(): we recheck the
 * page count inside the lock to see whether shrink_cache grabbed the page
 * via the LRU.  If it did, give up: shrink_cache will free it.
 */
void release_pages(struct page **pages, int nr, int cold)
{
	int i;
	struct pagevec pages_to_free;
	struct zone *zone = NULL;

	pagevec_init(&pages_to_free, cold);
	for (i = 0; i < nr; i++) {
		struct page *page = pages[i];

		if (unlikely(PageCompound(page))) {
			if (zone) {
				spin_unlock_irq(&zone->lru_lock);
				zone = NULL;
			}
			put_compound_page(page);
			continue;
		}

		if (!put_page_testzero(page))
			continue;

		if (PageLRU(page)) {
			struct zone *pagezone = page_zone(page);
			if (pagezone != zone) {
				if (zone)
					spin_unlock_irq(&zone->lru_lock);
				zone = pagezone;
				spin_lock_irq(&zone->lru_lock);
			}
			BUG_ON(!PageLRU(page));
			__ClearPageLRU(page);
			del_page_from_lru(zone, page);
		}

		if (!pagevec_add(&pages_to_free, page)) {
			if (zone) {
				spin_unlock_irq(&zone->lru_lock);
				zone = NULL;
			}
			__pagevec_free(&pages_to_free);
			pagevec_reinit(&pages_to_free);
		}
	}
	if (zone)
		spin_unlock_irq(&zone->lru_lock);

	pagevec_free(&pages_to_free);
}

/*
 * The pages which we're about to release may be in the deferred lru-addition
 * queues.  That would prevent them from really being freed right now.  That's
 * OK from a correctness point of view but is inefficient - those pages may be
 * cache-warm and we want to give them back to the page allocator ASAP.
 *
 * So __pagevec_release() will drain those queues here.  __pagevec_lru_add()
 * and __pagevec_lru_add_active() call release_pages() directly to avoid
 * mutual recursion.
 */
void __pagevec_release(struct pagevec *pvec)
{
	lru_add_drain();
	release_pages(pvec->pages, pagevec_count(pvec), pvec->cold);
	pagevec_reinit(pvec);
}

EXPORT_SYMBOL(__pagevec_release);

/*
 * pagevec_release() for pages which are known to not be on the LRU
 *
 * This function reinitialises the caller's pagevec.
 */
void __pagevec_release_nonlru(struct pagevec *pvec)
{
	int i;
	struct pagevec pages_to_free;

	pagevec_init(&pages_to_free, pvec->cold);
	for (i = 0; i < pagevec_count(pvec); i++) {
		struct page *page = pvec->pages[i];

		BUG_ON(PageLRU(page));
		if (put_page_testzero(page))
			pagevec_add(&pages_to_free, page);
	}
	pagevec_free(&pages_to_free);
	pagevec_reinit(pvec);
}
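
/*
 * Illustrative sketch (hypothetical caller, disabled): dropping references on
 * a batch of pages is usually done by collecting them in a pagevec and letting
 * pagevec_release() (an inline in <linux/pagevec.h> that calls
 * __pagevec_release() above for a non-empty pagevec) do the batched put,
 * rather than calling put_page() once per page.
 */
#if 0
static void example_put_pages(struct page **pages, int nr)
{
	struct pagevec pvec;
	int i;

	pagevec_init(&pvec, 0);			/* 0: treat the pages as cache-hot */
	for (i = 0; i < nr; i++)
		if (!pagevec_add(&pvec, pages[i]))
			pagevec_release(&pvec);	/* pagevec full: release this batch */
	pagevec_release(&pvec);			/* release any remainder */
}
#endif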

/*
 * Add the passed pages to the LRU, then drop the caller's refcount
 * on them.  Reinitialises the caller's pagevec.
 */
void __pagevec_lru_add(struct pagevec *pvec)
{
	int i;
	struct zone *zone = NULL;

	for (i = 0; i < pagevec_count(pvec); i++) {
		struct page *page = pvec->pages[i];
		struct zone *pagezone = page_zone(page);

		if (pagezone != zone) {
			if (zone)
				spin_unlock_irq(&zone->lru_lock);
			zone = pagezone;
			spin_lock_irq(&zone->lru_lock);
		}
		BUG_ON(PageLRU(page));
		SetPageLRU(page);
		add_page_to_inactive_list(zone, page);
	}
	if (zone)
		spin_unlock_irq(&zone->lru_lock);
	release_pages(pvec->pages, pvec->nr, pvec->cold);
	pagevec_reinit(pvec);
}

EXPORT_SYMBOL(__pagevec_lru_add);

void __pagevec_lru_add_active(struct pagevec *pvec)
{
	int i;
	struct zone *zone = NULL;

	for (i = 0; i < pagevec_count(pvec); i++) {
		struct page *page = pvec->pages[i];
		struct zone *pagezone = page_zone(page);

		if (pagezone != zone) {
			if (zone)
				spin_unlock_irq(&zone->lru_lock);
			zone = pagezone;
			spin_lock_irq(&zone->lru_lock);
		}
		BUG_ON(PageLRU(page));
		SetPageLRU(page);
		BUG_ON(PageActive(page));
		SetPageActive(page);
		add_page_to_active_list(zone, page);
	}
	if (zone)
		spin_unlock_irq(&zone->lru_lock);
	release_pages(pvec->pages, pvec->nr, pvec->cold);
	pagevec_reinit(pvec);
}

/*
 * Try to drop buffers from the pages in a pagevec
 */
void pagevec_strip(struct pagevec *pvec)
{
	int i;

	for (i = 0; i < pagevec_count(pvec); i++) {
		struct page *page = pvec->pages[i];

		if (PagePrivate(page) && !TestSetPageLocked(page)) {
			if (PagePrivate(page))
				try_to_release_page(page, 0);
			unlock_page(page);
		}
	}
}

/**
 * pagevec_lookup - gang pagecache lookup
 * @pvec:	Where the resulting pages are placed
 * @mapping:	The address_space to search
 * @start:	The starting page index
 * @nr_pages:	The maximum number of pages
 *
 * pagevec_lookup() will search for and return a group of up to @nr_pages pages
 * in the mapping.  The pages are placed in @pvec.  pagevec_lookup() takes a
 * reference against the pages in @pvec.
 *
 * The search returns a group of mapping-contiguous pages with ascending
 * indexes.  There may be holes in the indices due to not-present pages.
 *
 * pagevec_lookup() returns the number of pages which were found.
 */
unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
		pgoff_t start, unsigned nr_pages)
{
	pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages);
	return pagevec_count(pvec);
}

EXPORT_SYMBOL(pagevec_lookup);
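
/*
 * Illustrative sketch (hypothetical caller, disabled): the usual way to walk
 * the pages of a mapping is to call pagevec_lookup() in a loop, resuming the
 * search after the last page returned, and to drop the gang's references with
 * pagevec_release() after each batch.
 */
#if 0
static void example_walk_mapping(struct address_space *mapping)
{
	struct pagevec pvec;
	pgoff_t next = 0;
	int i;

	pagevec_init(&pvec, 0);
	while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];

			next = page->index + 1;	/* resume after this page */
			/* ... inspect the page here ... */
		}
		pagevec_release(&pvec);		/* drop the gang references */
	}
}
#endif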

unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping,
		pgoff_t *index, int tag, unsigned nr_pages)
{
	pvec->nr = find_get_pages_tag(mapping, index, tag,
					nr_pages, pvec->pages);
	return pagevec_count(pvec);
}

EXPORT_SYMBOL(pagevec_lookup_tag);

#ifdef CONFIG_SMP
/*
 * We tolerate a little inaccuracy to avoid ping-ponging the counter between
 * CPUs
 */
#define ACCT_THRESHOLD	max(16, NR_CPUS * 2)

static DEFINE_PER_CPU(long, committed_space) = 0;

void vm_acct_memory(long pages)
{
	long *local;

	preempt_disable();
	local = &__get_cpu_var(committed_space);
	*local += pages;
	if (*local > ACCT_THRESHOLD || *local < -ACCT_THRESHOLD) {
		atomic_add(*local, &vm_committed_space);
		*local = 0;
	}
	preempt_enable();
}

#ifdef CONFIG_HOTPLUG_CPU

/* Drop the CPU's cached committed space back into the central pool. */
static int cpu_swap_callback(struct notifier_block *nfb,
			     unsigned long action,
			     void *hcpu)
{
	long *committed;

	committed = &per_cpu(committed_space, (long)hcpu);
	if (action == CPU_DEAD) {
		atomic_add(*committed, &vm_committed_space);
		*committed = 0;
		__lru_add_drain((long)hcpu);
	}
	return NOTIFY_OK;
}
#endif /* CONFIG_HOTPLUG_CPU */
#endif /* CONFIG_SMP */

/*
 * Perform any setup for the swap system
 */
void __init swap_setup(void)
{
	unsigned long megs = num_physpages >> (20 - PAGE_SHIFT);

	/* Use a smaller cluster for small-memory machines */
	if (megs < 16)
		page_cluster = 2;
	else
		page_cluster = 3;
	/*
	 * Right now other parts of the system mean that we
	 * _really_ don't want to cluster much more
	 */
	hotcpu_notifier(cpu_swap_callback, 0);
}
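
/*
 * Illustrative sketch (hypothetical caller, disabled): committed-space
 * accounting is normally driven from the mmap/brk paths.  A caller charges
 * pages with vm_acct_memory() and gives the charge back with
 * vm_unacct_memory(), a wrapper in <linux/swap.h> that passes a negative
 * count to vm_acct_memory().  example_allocation_failed() stands in for
 * whatever can fail in the caller.
 */
#if 0
static int example_charge_pages(long pages)
{
	vm_acct_memory(pages);			/* charge against vm_committed_space */
	if (example_allocation_failed()) {
		vm_unacct_memory(pages);	/* undo the charge */
		return -ENOMEM;
	}
	return 0;
}
#endif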