vmscan.c revision 8695949a1d7c99e039595db00af8e0fe4722307d
1/*
2 *  linux/mm/vmscan.c
3 *
4 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
5 *
6 *  Swap reorganised 29.12.95, Stephen Tweedie.
7 *  kswapd added: 7.1.96  sct
8 *  Removed kswapd_ctl limits, and swap out as many pages as needed
9 *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
10 *  Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
11 *  Multiqueue VM started 5.8.00, Rik van Riel.
12 */
13
14#include <linux/mm.h>
15#include <linux/module.h>
16#include <linux/slab.h>
17#include <linux/kernel_stat.h>
18#include <linux/swap.h>
19#include <linux/pagemap.h>
20#include <linux/init.h>
21#include <linux/highmem.h>
22#include <linux/file.h>
23#include <linux/writeback.h>
24#include <linux/blkdev.h>
25#include <linux/buffer_head.h>	/* for try_to_release_page(),
26					buffer_heads_over_limit */
27#include <linux/mm_inline.h>
28#include <linux/pagevec.h>
29#include <linux/backing-dev.h>
30#include <linux/rmap.h>
31#include <linux/topology.h>
32#include <linux/cpu.h>
33#include <linux/cpuset.h>
34#include <linux/notifier.h>
35#include <linux/rwsem.h>
36
37#include <asm/tlbflush.h>
38#include <asm/div64.h>
39
40#include <linux/swapops.h>
41
42/* possible outcome of pageout() */
43typedef enum {
44	/* failed to write page out, page is locked */
45	PAGE_KEEP,
46	/* move page to the active list, page is locked */
47	PAGE_ACTIVATE,
48	/* page has been sent to the disk successfully, page is unlocked */
49	PAGE_SUCCESS,
50	/* page is clean and locked */
51	PAGE_CLEAN,
52} pageout_t;
53
54struct scan_control {
55	/* Incremented by the number of inactive pages that were scanned */
56	unsigned long nr_scanned;
57
58	/* Incremented by the number of pages reclaimed */
59	unsigned long nr_reclaimed;
60
61	unsigned long nr_mapped;	/* From page_state */
62
63	/* This context's GFP mask */
64	gfp_t gfp_mask;
65
66	int may_writepage;
67
68	/* Can pages be swapped as part of reclaim? */
69	int may_swap;
70
71	/* This context's SWAP_CLUSTER_MAX. If freeing memory for
72	 * suspend, we effectively ignore SWAP_CLUSTER_MAX.
73	 * In this context, it doesn't matter that we scan the
74	 * whole list at once. */
75	int swap_cluster_max;
76};
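
/*
 * Illustrative sketch (not from this file): a direct reclaim caller sets
 * up a scan_control on its stack before walking the zones, roughly
 *
 *	struct scan_control sc = {
 *		.gfp_mask	  = gfp_mask,
 *		.may_writepage	  = !laptop_mode,
 *		.may_swap	  = 1,
 *		.swap_cluster_max = SWAP_CLUSTER_MAX,
 *	};
 *
 * The real callers below (try_to_free_pages, balance_pgdat, zone_reclaim)
 * assign the fields one at a time, refreshing the per-priority counters on
 * every pass.
 */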
77
78/*
79 * The list of shrinker callbacks used to apply pressure to
80 * ageable caches.
81 */
82struct shrinker {
83	shrinker_t		shrinker;
84	struct list_head	list;
85	int			seeks;	/* seeks to recreate an obj */
86	long			nr;	/* objs pending delete */
87};
88
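/*
 * lru_to_page() returns the page at the tail of the list (head->prev).
 * Pages are added to the LRU lists at the head, so the tail holds the
 * entries that have been on a list longest and are scanned first.
 */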
89#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
90
91#ifdef ARCH_HAS_PREFETCH
92#define prefetch_prev_lru_page(_page, _base, _field)			\
93	do {								\
94		if ((_page)->lru.prev != _base) {			\
95			struct page *prev;				\
96									\
97			prev = lru_to_page(&(_page->lru));		\
98			prefetch(&prev->_field);			\
99		}							\
100	} while (0)
101#else
102#define prefetch_prev_lru_page(_page, _base, _field) do { } while (0)
103#endif
104
105#ifdef ARCH_HAS_PREFETCHW
106#define prefetchw_prev_lru_page(_page, _base, _field)			\
107	do {								\
108		if ((_page)->lru.prev != _base) {			\
109			struct page *prev;				\
110									\
111			prev = lru_to_page(&(_page->lru));		\
112			prefetchw(&prev->_field);			\
113		}							\
114	} while (0)
115#else
116#define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
117#endif
118
119/*
120 * From 0 .. 100.  Higher means more swappy.
121 */
122int vm_swappiness = 60;
123static long total_memory;
124
125static LIST_HEAD(shrinker_list);
126static DECLARE_RWSEM(shrinker_rwsem);
127
128/*
129 * Add a shrinker callback to be called from the vm
130 */
131struct shrinker *set_shrinker(int seeks, shrinker_t theshrinker)
132{
133	struct shrinker *shrinker;
134
135	shrinker = kmalloc(sizeof(*shrinker), GFP_KERNEL);
136	if (shrinker) {
137		shrinker->shrinker = theshrinker;
138		shrinker->seeks = seeks;
139		shrinker->nr = 0;
140		down_write(&shrinker_rwsem);
141		list_add_tail(&shrinker->list, &shrinker_list);
142		up_write(&shrinker_rwsem);
143	}
144	return shrinker;
145}
146EXPORT_SYMBOL(set_shrinker);
147
148/*
149 * Remove one
150 */
151void remove_shrinker(struct shrinker *shrinker)
152{
153	down_write(&shrinker_rwsem);
154	list_del(&shrinker->list);
155	up_write(&shrinker_rwsem);
156	kfree(shrinker);
157}
158EXPORT_SYMBOL(remove_shrinker);
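
/*
 * Illustrative sketch (hypothetical cache, not part of this file): a
 * subsystem that wants the VM to age its objects registers a shrinker_t
 * callback.  Called with nr_to_scan == 0 it only reports how many objects
 * it could free; otherwise it scans up to nr_to_scan objects and returns
 * the remaining count:
 *
 *	static int my_cache_shrink(int nr_to_scan, gfp_t gfp_mask)
 *	{
 *		if (nr_to_scan)
 *			my_cache_free_some(nr_to_scan, gfp_mask);
 *		return my_cache_object_count();
 *	}
 *
 *	my_shrinker = set_shrinker(DEFAULT_SEEKS, my_cache_shrink);
 *	...
 *	remove_shrinker(my_shrinker);
 */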
159
160#define SHRINK_BATCH 128
161/*
162 * Call the shrink functions to age shrinkable caches
163 *
164 * Here we assume it costs one seek to replace a lru page and that it also
165 * takes a seek to recreate a cache object.  With this in mind we age equal
166 * percentages of the lru and ageable caches.  This should balance the seeks
167 * generated by these structures.
168 *
169 * If the vm encountered mapped pages on the LRU, it increases the pressure on
170 * slab to avoid swapping.
171 *
172 * We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits.
173 *
174 * `lru_pages' represents the number of on-LRU pages in all the zones which
175 * are eligible for the caller's allocation attempt.  It is used for balancing
176 * slab reclaim versus page reclaim.
177 *
178 * Returns the number of slab objects which we shrunk.
179 */
180int shrink_slab(unsigned long scanned, gfp_t gfp_mask, unsigned long lru_pages)
181{
182	struct shrinker *shrinker;
183	int ret = 0;
184
185	if (scanned == 0)
186		scanned = SWAP_CLUSTER_MAX;
187
188	if (!down_read_trylock(&shrinker_rwsem))
189		return 1;	/* Assume we'll be able to shrink next time */
190
191	list_for_each_entry(shrinker, &shrinker_list, list) {
192		unsigned long long delta;
193		unsigned long total_scan;
194		unsigned long max_pass = (*shrinker->shrinker)(0, gfp_mask);
195
196		delta = (4 * scanned) / shrinker->seeks;
197		delta *= max_pass;
198		do_div(delta, lru_pages + 1);
199		shrinker->nr += delta;
200		if (shrinker->nr < 0) {
201			printk(KERN_ERR "%s: nr=%ld\n",
202					__FUNCTION__, shrinker->nr);
203			shrinker->nr = max_pass;
204		}
205
206		/*
207		 * Avoid risking looping forever due to too large nr value:
208		 * never try to free more than twice the estimated number of
209		 * freeable entries.
210		 */
211		if (shrinker->nr > max_pass * 2)
212			shrinker->nr = max_pass * 2;
213
214		total_scan = shrinker->nr;
215		shrinker->nr = 0;
216
217		while (total_scan >= SHRINK_BATCH) {
218			long this_scan = SHRINK_BATCH;
219			int shrink_ret;
220			int nr_before;
221
222			nr_before = (*shrinker->shrinker)(0, gfp_mask);
223			shrink_ret = (*shrinker->shrinker)(this_scan, gfp_mask);
224			if (shrink_ret == -1)
225				break;
226			if (shrink_ret < nr_before)
227				ret += nr_before - shrink_ret;
228			mod_page_state(slabs_scanned, this_scan);
229			total_scan -= this_scan;
230
231			cond_resched();
232		}
233
234		shrinker->nr += total_scan;
235	}
236	up_read(&shrinker_rwsem);
237	return ret;
238}
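
/*
 * Worked example of the pressure calculation above (illustrative numbers):
 * with scanned = 1024 LRU pages, lru_pages = 100000, seeks = DEFAULT_SEEKS
 * (2) and max_pass = 50000 freeable objects,
 *
 *	delta  = (4 * 1024) / 2          =  2048
 *	delta  = 2048 * 50000 / 100001  ~=  1023
 *
 * so roughly 1023 objects are added to shrinker->nr and then scanned in
 * SHRINK_BATCH (128) sized chunks, with any remainder smaller than a batch
 * carried over in shrinker->nr for the next call.
 */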
239
240/* Called without a lock that stabilizes page_mapped(), so the answer is unstable */
241static inline int page_mapping_inuse(struct page *page)
242{
243	struct address_space *mapping;
244
245	/* Page is in somebody's page tables. */
246	if (page_mapped(page))
247		return 1;
248
249	/* Be more reluctant to reclaim swapcache than pagecache */
250	if (PageSwapCache(page))
251		return 1;
252
253	mapping = page_mapping(page);
254	if (!mapping)
255		return 0;
256
257	/* File is mmap'd by somebody? */
258	return mapping_mapped(mapping);
259}
260
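/*
 * A pagecache page is freeable when it is referenced only by the caller
 * that isolated it, by the page cache itself, and (optionally) by the
 * buffer heads at page->private -- hence the "count minus PagePrivate
 * equals two" test below.
 */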
261static inline int is_page_cache_freeable(struct page *page)
262{
263	return page_count(page) - !!PagePrivate(page) == 2;
264}
265
266static int may_write_to_queue(struct backing_dev_info *bdi)
267{
268	if (current->flags & PF_SWAPWRITE)
269		return 1;
270	if (!bdi_write_congested(bdi))
271		return 1;
272	if (bdi == current->backing_dev_info)
273		return 1;
274	return 0;
275}
276
277/*
278 * We detected a synchronous write error writing a page out.  Probably
279 * -ENOSPC.  We need to propagate that into the address_space for a subsequent
280 * fsync(), msync() or close().
281 *
282 * The tricky part is that after writepage we cannot touch the mapping: nothing
283 * prevents it from being freed up.  But we have a ref on the page and once
284 * that page is locked, the mapping is pinned.
285 *
286 * We're allowed to run sleeping lock_page() here because we know the caller has
287 * __GFP_FS.
288 */
289static void handle_write_error(struct address_space *mapping,
290				struct page *page, int error)
291{
292	lock_page(page);
293	if (page_mapping(page) == mapping) {
294		if (error == -ENOSPC)
295			set_bit(AS_ENOSPC, &mapping->flags);
296		else
297			set_bit(AS_EIO, &mapping->flags);
298	}
299	unlock_page(page);
300}
301
302/*
303 * pageout is called by shrink_list() for each dirty page. Calls ->writepage().
304 */
305static pageout_t pageout(struct page *page, struct address_space *mapping)
306{
307	/*
308	 * If the page is dirty, only perform writeback if that write
309	 * will be non-blocking, to prevent this allocation from being
310	 * stalled by pagecache activity.  But note that there may be
311	 * stalls if we need to run get_block().  We could test
312	 * PagePrivate for that.
313	 *
314	 * If this process is currently in generic_file_write() against
315	 * this page's queue, we can perform writeback even if that
316	 * will block.
317	 *
318	 * If the page is swapcache, write it back even if that would
319	 * block, for some throttling. This happens by accident, because
320	 * swap_backing_dev_info is bust: it doesn't reflect the
321	 * congestion state of the swapdevs.  Easy to fix, if needed.
322	 * See swapfile.c:page_queue_congested().
323	 */
324	if (!is_page_cache_freeable(page))
325		return PAGE_KEEP;
326	if (!mapping) {
327		/*
328		 * Some data journaling orphaned pages can have
329		 * page->mapping == NULL while being dirty with clean buffers.
330		 */
331		if (PagePrivate(page)) {
332			if (try_to_free_buffers(page)) {
333				ClearPageDirty(page);
334				printk("%s: orphaned page\n", __FUNCTION__);
335				return PAGE_CLEAN;
336			}
337		}
338		return PAGE_KEEP;
339	}
340	if (mapping->a_ops->writepage == NULL)
341		return PAGE_ACTIVATE;
342	if (!may_write_to_queue(mapping->backing_dev_info))
343		return PAGE_KEEP;
344
345	if (clear_page_dirty_for_io(page)) {
346		int res;
347		struct writeback_control wbc = {
348			.sync_mode = WB_SYNC_NONE,
349			.nr_to_write = SWAP_CLUSTER_MAX,
350			.nonblocking = 1,
351			.for_reclaim = 1,
352		};
353
354		SetPageReclaim(page);
355		res = mapping->a_ops->writepage(page, &wbc);
356		if (res < 0)
357			handle_write_error(mapping, page, res);
358		if (res == AOP_WRITEPAGE_ACTIVATE) {
359			ClearPageReclaim(page);
360			return PAGE_ACTIVATE;
361		}
362		if (!PageWriteback(page)) {
363			/* synchronous write or broken a_ops? */
364			ClearPageReclaim(page);
365		}
366
367		return PAGE_SUCCESS;
368	}
369
370	return PAGE_CLEAN;
371}
372
373static int remove_mapping(struct address_space *mapping, struct page *page)
374{
375	if (!mapping)
376		return 0;		/* truncate got there first */
377
378	write_lock_irq(&mapping->tree_lock);
379
380	/*
381	 * The non-racy check for busy page.  It is critical to check
382	 * PageDirty _after_ making sure that the page is freeable and
383	 * not in use by anybody.  (pagecache + us == 2)
384	 */
385	if (unlikely(page_count(page) != 2))
386		goto cannot_free;
387	smp_rmb();
388	if (unlikely(PageDirty(page)))
389		goto cannot_free;
390
391	if (PageSwapCache(page)) {
392		swp_entry_t swap = { .val = page_private(page) };
393		__delete_from_swap_cache(page);
394		write_unlock_irq(&mapping->tree_lock);
395		swap_free(swap);
396		__put_page(page);	/* The pagecache ref */
397		return 1;
398	}
399
400	__remove_from_page_cache(page);
401	write_unlock_irq(&mapping->tree_lock);
402	__put_page(page);
403	return 1;
404
405cannot_free:
406	write_unlock_irq(&mapping->tree_lock);
407	return 0;
408}
409
410/*
411 * shrink_list adds the number of reclaimed pages to sc->nr_reclaimed
412 */
413static int shrink_list(struct list_head *page_list, struct scan_control *sc)
414{
415	LIST_HEAD(ret_pages);
416	struct pagevec freed_pvec;
417	int pgactivate = 0;
418	int reclaimed = 0;
419
420	cond_resched();
421
422	pagevec_init(&freed_pvec, 1);
423	while (!list_empty(page_list)) {
424		struct address_space *mapping;
425		struct page *page;
426		int may_enter_fs;
427		int referenced;
428
429		cond_resched();
430
431		page = lru_to_page(page_list);
432		list_del(&page->lru);
433
434		if (TestSetPageLocked(page))
435			goto keep;
436
437		BUG_ON(PageActive(page));
438
439		sc->nr_scanned++;
440
441		if (!sc->may_swap && page_mapped(page))
442			goto keep_locked;
443
444		/* Double the slab pressure for mapped and swapcache pages */
445		if (page_mapped(page) || PageSwapCache(page))
446			sc->nr_scanned++;
447
448		if (PageWriteback(page))
449			goto keep_locked;
450
451		referenced = page_referenced(page, 1);
452		/* In active use or really unfreeable?  Activate it. */
453		if (referenced && page_mapping_inuse(page))
454			goto activate_locked;
455
456#ifdef CONFIG_SWAP
457		/*
458		 * Anonymous process memory has backing store?
459		 * Try to allocate it some swap space here.
460		 */
461		if (PageAnon(page) && !PageSwapCache(page)) {
462			if (!sc->may_swap)
463				goto keep_locked;
464			if (!add_to_swap(page, GFP_ATOMIC))
465				goto activate_locked;
466		}
467#endif /* CONFIG_SWAP */
468
469		mapping = page_mapping(page);
470		may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
471			(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
472
473		/*
474		 * The page is mapped into the page tables of one or more
475		 * processes. Try to unmap it here.
476		 */
477		if (page_mapped(page) && mapping) {
478			/*
479			 * No unmapping if we do not swap
480			 */
481			if (!sc->may_swap)
482				goto keep_locked;
483
484			switch (try_to_unmap(page, 0)) {
485			case SWAP_FAIL:
486				goto activate_locked;
487			case SWAP_AGAIN:
488				goto keep_locked;
489			case SWAP_SUCCESS:
490				; /* try to free the page below */
491			}
492		}
493
494		if (PageDirty(page)) {
495			if (referenced)
496				goto keep_locked;
497			if (!may_enter_fs)
498				goto keep_locked;
499			if (!sc->may_writepage)
500				goto keep_locked;
501
502			/* Page is dirty, try to write it out here */
503			switch(pageout(page, mapping)) {
504			case PAGE_KEEP:
505				goto keep_locked;
506			case PAGE_ACTIVATE:
507				goto activate_locked;
508			case PAGE_SUCCESS:
509				if (PageWriteback(page) || PageDirty(page))
510					goto keep;
511				/*
512				 * A synchronous write - probably a ramdisk.  Go
513				 * ahead and try to reclaim the page.
514				 */
515				if (TestSetPageLocked(page))
516					goto keep;
517				if (PageDirty(page) || PageWriteback(page))
518					goto keep_locked;
519				mapping = page_mapping(page);
520			case PAGE_CLEAN:
521				; /* try to free the page below */
522			}
523		}
524
525		/*
526		 * If the page has buffers, try to free the buffer mappings
527		 * associated with this page. If we succeed we try to free
528		 * the page as well.
529		 *
530		 * We do this even if the page is PageDirty().
531		 * try_to_release_page() does not perform I/O, but it is
532		 * possible for a page to have PageDirty set, but it is actually
533		 * clean (all its buffers are clean).  This happens if the
534		 * buffers were written out directly, with submit_bh(). ext3
535		 * will do this, as well as the blockdev mapping.
536		 * try_to_release_page() will discover that cleanness and will
537		 * drop the buffers and mark the page clean - it can be freed.
538		 *
539		 * Rarely, pages can have buffers and no ->mapping.  These are
540		 * the pages which were not successfully invalidated in
541		 * truncate_complete_page().  We try to drop those buffers here
542		 * and if that worked, and the page is no longer mapped into
543		 * process address space (page_count == 1) it can be freed.
544		 * Otherwise, leave the page on the LRU so it is swappable.
545		 */
546		if (PagePrivate(page)) {
547			if (!try_to_release_page(page, sc->gfp_mask))
548				goto activate_locked;
549			if (!mapping && page_count(page) == 1)
550				goto free_it;
551		}
552
553		if (!remove_mapping(mapping, page))
554			goto keep_locked;
555
556free_it:
557		unlock_page(page);
558		reclaimed++;
559		if (!pagevec_add(&freed_pvec, page))
560			__pagevec_release_nonlru(&freed_pvec);
561		continue;
562
563activate_locked:
564		SetPageActive(page);
565		pgactivate++;
566keep_locked:
567		unlock_page(page);
568keep:
569		list_add(&page->lru, &ret_pages);
570		BUG_ON(PageLRU(page));
571	}
572	list_splice(&ret_pages, page_list);
573	if (pagevec_count(&freed_pvec))
574		__pagevec_release_nonlru(&freed_pvec);
575	mod_page_state(pgactivate, pgactivate);
576	sc->nr_reclaimed += reclaimed;
577	return reclaimed;
578}
579
580#ifdef CONFIG_MIGRATION
581static inline void move_to_lru(struct page *page)
582{
583	list_del(&page->lru);
584	if (PageActive(page)) {
585		/*
586		 * lru_cache_add_active checks that
587		 * the PG_active bit is off.
588		 */
589		ClearPageActive(page);
590		lru_cache_add_active(page);
591	} else {
592		lru_cache_add(page);
593	}
594	put_page(page);
595}
596
597/*
598 * Add isolated pages on the list back to the LRU.
599 *
600 * returns the number of pages put back.
601 */
602int putback_lru_pages(struct list_head *l)
603{
604	struct page *page;
605	struct page *page2;
606	int count = 0;
607
608	list_for_each_entry_safe(page, page2, l, lru) {
609		move_to_lru(page);
610		count++;
611	}
612	return count;
613}
614
615/*
616 * Non migratable page
617 */
618int fail_migrate_page(struct page *newpage, struct page *page)
619{
620	return -EIO;
621}
622EXPORT_SYMBOL(fail_migrate_page);
623
624/*
625 * swapout a single page
626 * page is locked upon entry, unlocked on exit
627 */
628static int swap_page(struct page *page)
629{
630	struct address_space *mapping = page_mapping(page);
631
632	if (page_mapped(page) && mapping)
633		if (try_to_unmap(page, 1) != SWAP_SUCCESS)
634			goto unlock_retry;
635
636	if (PageDirty(page)) {
637		/* Page is dirty, try to write it out here */
638		switch(pageout(page, mapping)) {
639		case PAGE_KEEP:
640		case PAGE_ACTIVATE:
641			goto unlock_retry;
642
643		case PAGE_SUCCESS:
644			goto retry;
645
646		case PAGE_CLEAN:
647			; /* try to free the page below */
648		}
649	}
650
651	if (PagePrivate(page)) {
652		if (!try_to_release_page(page, GFP_KERNEL) ||
653		    (!mapping && page_count(page) == 1))
654			goto unlock_retry;
655	}
656
657	if (remove_mapping(mapping, page)) {
658		/* Success */
659		unlock_page(page);
660		return 0;
661	}
662
663unlock_retry:
664	unlock_page(page);
665
666retry:
667	return -EAGAIN;
668}
669EXPORT_SYMBOL(swap_page);
670
671/*
672 * Page migration was first developed in the context of the memory hotplug
673 * project. The main authors of the migration code are:
674 *
675 * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
676 * Hirokazu Takahashi <taka@valinux.co.jp>
677 * Dave Hansen <haveblue@us.ibm.com>
678 * Christoph Lameter <clameter@sgi.com>
679 */
680
681/*
682 * Remove references for a page and establish the new page with the correct
683 * basic settings to be able to stop accesses to the page.
684 */
685int migrate_page_remove_references(struct page *newpage,
686				struct page *page, int nr_refs)
687{
688	struct address_space *mapping = page_mapping(page);
689	struct page **radix_pointer;
690
691	/*
692	 * Avoid doing any of the following work if the page count
693	 * indicates that the page is in use or truncate has removed
694	 * the page.
695	 */
696	if (!mapping || page_mapcount(page) + nr_refs != page_count(page))
697		return -EAGAIN;
698
699	/*
700	 * Establish swap ptes for anonymous pages or destroy pte
701	 * maps for files.
702	 *
703	 * In order to reestablish file backed mappings the fault handlers
704	 * will take the radix tree_lock which may then be used to stop
705	 * processes from accessing this page until the new page is ready.
706	 *
707	 * A process accessing via a swap pte (an anonymous page) will take a
708	 * page_lock on the old page which will block the process until the
709	 * migration attempt is complete. At that time the PageSwapCache bit
710	 * will be examined. If the page was migrated then the PageSwapCache
711	 * bit will be clear and the operation to retrieve the page will be
712	 * retried which will find the new page in the radix tree. Then a new
713	 * direct mapping may be generated based on the radix tree contents.
714	 *
715	 * If the page was not migrated then the PageSwapCache bit
716	 * is still set and the operation may continue.
717	 */
718	if (try_to_unmap(page, 1) == SWAP_FAIL)
719		/* A vma has VM_LOCKED set -> Permanent failure */
720		return -EPERM;
721
722	/*
723	 * Give up if we were unable to remove all mappings.
724	 */
725	if (page_mapcount(page))
726		return -EAGAIN;
727
728	write_lock_irq(&mapping->tree_lock);
729
730	radix_pointer = (struct page **)radix_tree_lookup_slot(
731						&mapping->page_tree,
732						page_index(page));
733
734	if (!page_mapping(page) || page_count(page) != nr_refs ||
735			*radix_pointer != page) {
736		write_unlock_irq(&mapping->tree_lock);
737		return -EAGAIN;
738	}
739
740	/*
741	 * Now we know that no one else is looking at the page.
742	 *
743	 * Certain minimal information about a page must be available
744	 * in order for other subsystems to properly handle the page if they
745	 * find it through the radix tree update before we are finished
746	 * copying the page.
747	 */
748	get_page(newpage);
749	newpage->index = page->index;
750	newpage->mapping = page->mapping;
751	if (PageSwapCache(page)) {
752		SetPageSwapCache(newpage);
753		set_page_private(newpage, page_private(page));
754	}
755
756	*radix_pointer = newpage;
757	__put_page(page);
758	write_unlock_irq(&mapping->tree_lock);
759
760	return 0;
761}
762EXPORT_SYMBOL(migrate_page_remove_references);
763
764/*
765 * Copy the page to its new location
766 */
767void migrate_page_copy(struct page *newpage, struct page *page)
768{
769	copy_highpage(newpage, page);
770
771	if (PageError(page))
772		SetPageError(newpage);
773	if (PageReferenced(page))
774		SetPageReferenced(newpage);
775	if (PageUptodate(page))
776		SetPageUptodate(newpage);
777	if (PageActive(page))
778		SetPageActive(newpage);
779	if (PageChecked(page))
780		SetPageChecked(newpage);
781	if (PageMappedToDisk(page))
782		SetPageMappedToDisk(newpage);
783
784	if (PageDirty(page)) {
785		clear_page_dirty_for_io(page);
786		set_page_dirty(newpage);
787	}
788
789	ClearPageSwapCache(page);
790	ClearPageActive(page);
791	ClearPagePrivate(page);
792	set_page_private(page, 0);
793	page->mapping = NULL;
794
795	/*
796	 * If any waiters have accumulated on the new page then
797	 * wake them up.
798	 */
799	if (PageWriteback(newpage))
800		end_page_writeback(newpage);
801}
802EXPORT_SYMBOL(migrate_page_copy);
803
804/*
805 * Common logic to directly migrate a single page suitable for
806 * pages that do not use PagePrivate.
807 *
808 * Pages are locked upon entry and exit.
809 */
810int migrate_page(struct page *newpage, struct page *page)
811{
812	int rc;
813
814	BUG_ON(PageWriteback(page));	/* Writeback must be complete */
815
816	rc = migrate_page_remove_references(newpage, page, 2);
817
818	if (rc)
819		return rc;
820
821	migrate_page_copy(newpage, page);
822
823	/*
824	 * Remove auxiliary swap entries and replace
825	 * them with real ptes.
826	 *
827	 * Note that a real pte entry will allow processes that are not
828	 * waiting on the page lock to use the new page via the page tables
829	 * before the new page is unlocked.
830	 */
831	remove_from_swap(newpage);
832	return 0;
833}
834EXPORT_SYMBOL(migrate_page);
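
/*
 * Illustrative sketch (hypothetical filesystem, not part of this file):
 * an address_space whose pages carry no filesystem-private state can use
 * the generic helper above directly for its ->migratepage method:
 *
 *	static struct address_space_operations myfs_aops = {
 *		...
 *		.migratepage	= migrate_page,
 *	};
 */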
835
836/*
837 * migrate_pages
838 *
839 * Two lists are passed to this function. The first list
840 * contains the pages isolated from the LRU to be migrated.
841 * The second list contains new pages that the pages isolated
842 * can be moved to. If the second list is NULL then all
843 * pages are swapped out.
844 *
845 * The function returns after 10 attempts or if no pages
846 * are movable anymore because the "to" list has become empty
847 * or no retryable pages exist anymore.
848 *
849 * Return: Number of pages not migrated when "to" ran empty.
850 */
851int migrate_pages(struct list_head *from, struct list_head *to,
852		  struct list_head *moved, struct list_head *failed)
853{
854	int retry;
855	int nr_failed = 0;
856	int pass = 0;
857	struct page *page;
858	struct page *page2;
859	int swapwrite = current->flags & PF_SWAPWRITE;
860	int rc;
861
862	if (!swapwrite)
863		current->flags |= PF_SWAPWRITE;
864
865redo:
866	retry = 0;
867
868	list_for_each_entry_safe(page, page2, from, lru) {
869		struct page *newpage = NULL;
870		struct address_space *mapping;
871
872		cond_resched();
873
874		rc = 0;
875		if (page_count(page) == 1)
876			/* page was freed from under us. So we are done. */
877			goto next;
878
879		if (to && list_empty(to))
880			break;
881
882		/*
883		 * Skip locked pages during the first two passes to give the
884		 * functions holding the lock time to release the page. Later we
885		 * use lock_page() to have a higher chance of acquiring the
886		 * lock.
887		 */
888		rc = -EAGAIN;
889		if (pass > 2)
890			lock_page(page);
891		else
892			if (TestSetPageLocked(page))
893				goto next;
894
895		/*
896		 * Only wait on writeback if we have already done a pass where
897		 * we may have triggered writeouts for lots of pages.
898		 */
899		if (pass > 0) {
900			wait_on_page_writeback(page);
901		} else {
902			if (PageWriteback(page))
903				goto unlock_page;
904		}
905
906		/*
907		 * Anonymous pages must have swap cache references otherwise
908		 * the information contained in the page maps cannot be
909		 * preserved.
910		 */
911		if (PageAnon(page) && !PageSwapCache(page)) {
912			if (!add_to_swap(page, GFP_KERNEL)) {
913				rc = -ENOMEM;
914				goto unlock_page;
915			}
916		}
917
918		if (!to) {
919			rc = swap_page(page);
920			goto next;
921		}
922
923		newpage = lru_to_page(to);
924		lock_page(newpage);
925
926		/*
927		 * Pages are properly locked and writeback is complete.
928		 * Try to migrate the page.
929		 */
930		mapping = page_mapping(page);
931		if (!mapping)
932			goto unlock_both;
933
934		if (mapping->a_ops->migratepage) {
935			/*
936			 * Most pages have a mapping and most filesystems
937			 * should provide a migration function. Anonymous
938			 * pages are part of swap space which also has its
939			 * own migration function. This is the most common
940			 * path for page migration.
941			 */
942			rc = mapping->a_ops->migratepage(newpage, page);
943			goto unlock_both;
944		}
945
946		/*
947		 * Default handling if a filesystem does not provide
948		 * a migration function. We can only migrate clean
949		 * pages so try to write out any dirty pages first.
950		 */
951		if (PageDirty(page)) {
952			switch (pageout(page, mapping)) {
953			case PAGE_KEEP:
954			case PAGE_ACTIVATE:
955				goto unlock_both;
956
957			case PAGE_SUCCESS:
958				unlock_page(newpage);
959				goto next;
960
961			case PAGE_CLEAN:
962				; /* try to migrate the page below */
963			}
964		}
965
966		/*
967		 * Buffers are managed in a filesystem specific way.
968		 * We must have no buffers or drop them.
969		 */
970		if (!page_has_buffers(page) ||
971		    try_to_release_page(page, GFP_KERNEL)) {
972			rc = migrate_page(newpage, page);
973			goto unlock_both;
974		}
975
976		/*
977		 * On early passes with mapped pages simply
978		 * retry. There may be a lock held for some
979		 * buffers that may go away. Later
980		 * swap them out.
981		 */
982		if (pass > 4) {
983			/*
984			 * Persistently unable to drop buffers..... As a
985			 * measure of last resort we fall back to
986			 * swap_page().
987			 */
988			unlock_page(newpage);
989			newpage = NULL;
990			rc = swap_page(page);
991			goto next;
992		}
993
994unlock_both:
995		unlock_page(newpage);
996
997unlock_page:
998		unlock_page(page);
999
1000next:
1001		if (rc == -EAGAIN) {
1002			retry++;
1003		} else if (rc) {
1004			/* Permanent failure */
1005			list_move(&page->lru, failed);
1006			nr_failed++;
1007		} else {
1008			if (newpage) {
1009				/* Successful migration. Return page to LRU */
1010				move_to_lru(newpage);
1011			}
1012			list_move(&page->lru, moved);
1013		}
1014	}
1015	if (retry && pass++ < 10)
1016		goto redo;
1017
1018	if (!swapwrite)
1019		current->flags &= ~PF_SWAPWRITE;
1020
1021	return nr_failed + retry;
1022}
1023
1024/*
1025 * Isolate one page from the LRU lists and put it on the
1026 * indicated list with elevated refcount.
1027 *
1028 * Result:
1029 *  0 = page not on LRU list
1030 *  1 = page removed from LRU list and added to the specified list.
1031 */
1032int isolate_lru_page(struct page *page)
1033{
1034	int ret = 0;
1035
1036	if (PageLRU(page)) {
1037		struct zone *zone = page_zone(page);
1038		spin_lock_irq(&zone->lru_lock);
1039		if (PageLRU(page)) {
1040			ret = 1;
1041			get_page(page);
1042			ClearPageLRU(page);
1043			if (PageActive(page))
1044				del_page_from_active_list(zone, page);
1045			else
1046				del_page_from_inactive_list(zone, page);
1047		}
1048		spin_unlock_irq(&zone->lru_lock);
1049	}
1050
1051	return ret;
1052}
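
/*
 * Illustrative sketch (hypothetical caller, not part of this file): the
 * helpers above combine into the usual migration sequence -- isolate the
 * pages onto a private list, hand them to migrate_pages() together with a
 * list of freshly allocated target pages ("newlist" below, or NULL to swap
 * them out), and return anything left over to the LRU:
 *
 *	LIST_HEAD(pagelist);
 *	LIST_HEAD(moved);
 *	LIST_HEAD(failed);
 *
 *	if (isolate_lru_page(page))
 *		list_add_tail(&page->lru, &pagelist);
 *	...
 *	migrate_pages(&pagelist, &newlist, &moved, &failed);
 *	putback_lru_pages(&moved);
 *	putback_lru_pages(&failed);
 */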
1053#endif
1054
1055/*
1056 * zone->lru_lock is heavily contended.  Some of the functions that
1057 * shrink the lists perform better by taking out a batch of pages
1058 * and working on them outside the LRU lock.
1059 *
1060 * For pagecache intensive workloads, this function is the hottest
1061 * spot in the kernel (apart from copy_*_user functions).
1062 *
1063 * Appropriate locks must be held before calling this function.
1064 *
1065 * @nr_to_scan:	The number of pages to look through on the list.
1066 * @src:	The LRU list to pull pages off.
1067 * @dst:	The temp list to put pages on to.
1068 * @scanned:	The number of pages that were scanned.
1069 *
1070 * returns how many pages were moved onto *@dst.
1071 */
1072static int isolate_lru_pages(int nr_to_scan, struct list_head *src,
1073			     struct list_head *dst, int *scanned)
1074{
1075	int nr_taken = 0;
1076	struct page *page;
1077	int scan = 0;
1078
1079	while (scan++ < nr_to_scan && !list_empty(src)) {
1080		struct list_head *target;
1081		page = lru_to_page(src);
1082		prefetchw_prev_lru_page(page, src, flags);
1083
1084		BUG_ON(!PageLRU(page));
1085
1086		list_del(&page->lru);
1087		target = src;
1088		if (likely(get_page_unless_zero(page))) {
1089			/*
1090			 * Be careful not to clear PageLRU until after we're
1091			 * sure the page is not being freed elsewhere -- the
1092			 * page release code relies on it.
1093			 */
1094			ClearPageLRU(page);
1095			target = dst;
1096			nr_taken++;
1097		} /* else it is being freed elsewhere */
1098
1099		list_add(&page->lru, target);
1100	}
1101
1102	*scanned = scan;
1103	return nr_taken;
1104}
1105
1106/*
1107 * shrink_cache() adds the number of pages reclaimed to sc->nr_reclaimed
1108 */
1109static void shrink_cache(int max_scan, struct zone *zone, struct scan_control *sc)
1110{
1111	LIST_HEAD(page_list);
1112	struct pagevec pvec;
1113
1114	pagevec_init(&pvec, 1);
1115
1116	lru_add_drain();
1117	spin_lock_irq(&zone->lru_lock);
1118	while (max_scan > 0) {
1119		struct page *page;
1120		int nr_taken;
1121		int nr_scan;
1122		int nr_freed;
1123
1124		nr_taken = isolate_lru_pages(sc->swap_cluster_max,
1125					     &zone->inactive_list,
1126					     &page_list, &nr_scan);
1127		zone->nr_inactive -= nr_taken;
1128		zone->pages_scanned += nr_scan;
1129		spin_unlock_irq(&zone->lru_lock);
1130
1131		if (nr_taken == 0)
1132			goto done;
1133
1134		max_scan -= nr_scan;
1135		nr_freed = shrink_list(&page_list, sc);
1136
1137		local_irq_disable();
1138		if (current_is_kswapd()) {
1139			__mod_page_state_zone(zone, pgscan_kswapd, nr_scan);
1140			__mod_page_state(kswapd_steal, nr_freed);
1141		} else
1142			__mod_page_state_zone(zone, pgscan_direct, nr_scan);
1143		__mod_page_state_zone(zone, pgsteal, nr_freed);
1144
1145		spin_lock(&zone->lru_lock);
1146		/*
1147		 * Put back any unfreeable pages.
1148		 */
1149		while (!list_empty(&page_list)) {
1150			page = lru_to_page(&page_list);
1151			BUG_ON(PageLRU(page));
1152			SetPageLRU(page);
1153			list_del(&page->lru);
1154			if (PageActive(page))
1155				add_page_to_active_list(zone, page);
1156			else
1157				add_page_to_inactive_list(zone, page);
1158			if (!pagevec_add(&pvec, page)) {
1159				spin_unlock_irq(&zone->lru_lock);
1160				__pagevec_release(&pvec);
1161				spin_lock_irq(&zone->lru_lock);
1162			}
1163		}
1164	}
1165	spin_unlock_irq(&zone->lru_lock);
1166done:
1167	pagevec_release(&pvec);
1168}
1169
1170/*
1171 * This moves pages from the active list to the inactive list.
1172 *
1173 * We move them the other way if the page is referenced by one or more
1174 * processes, from rmap.
1175 *
1176 * If the pages are mostly unmapped, the processing is fast and it is
1177 * appropriate to hold zone->lru_lock across the whole operation.  But if
1178 * the pages are mapped, the processing is slow (page_referenced()) so we
1179 * should drop zone->lru_lock around each page.  It's impossible to balance
1180 * this, so instead we remove the pages from the LRU while processing them.
1181 * It is safe to rely on PG_active against the non-LRU pages in here because
1182 * nobody will play with that bit on a non-LRU page.
1183 *
1184 * The downside is that we have to touch page->_count against each page.
1185 * But we had to alter page->flags anyway.
1186 */
1187static void
1188refill_inactive_zone(int nr_pages, struct zone *zone, struct scan_control *sc)
1189{
1190	int pgmoved;
1191	int pgdeactivate = 0;
1192	int pgscanned;
1193	LIST_HEAD(l_hold);	/* The pages which were snipped off */
1194	LIST_HEAD(l_inactive);	/* Pages to go onto the inactive_list */
1195	LIST_HEAD(l_active);	/* Pages to go onto the active_list */
1196	struct page *page;
1197	struct pagevec pvec;
1198	int reclaim_mapped = 0;
1199
1200	if (unlikely(sc->may_swap)) {
1201		long mapped_ratio;
1202		long distress;
1203		long swap_tendency;
1204
1205		/*
1206		 * `distress' is a measure of how much trouble we're having
1207		 * reclaiming pages.  0 -> no problems.  100 -> great trouble.
1208		 */
1209		distress = 100 >> zone->prev_priority;
1210
1211		/*
1212		 * The point of this algorithm is to decide when to start
1213		 * reclaiming mapped memory instead of just pagecache.  Work out
1214		 * how much memory
1215		 * is mapped.
1216		 */
1217		mapped_ratio = (sc->nr_mapped * 100) / total_memory;
1218
1219		/*
1220		 * Now decide how much we really want to unmap some pages.  The
1221		 * mapped ratio is downgraded - just because there's a lot of
1222		 * mapped memory doesn't necessarily mean that page reclaim
1223		 * isn't succeeding.
1224		 *
1225		 * The distress ratio is important - we don't want to start
1226		 * going oom.
1227		 *
1228		 * A 100% value of vm_swappiness overrides this algorithm
1229		 * altogether.
1230		 */
1231		swap_tendency = mapped_ratio / 2 + distress + vm_swappiness;
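
		/*
		 * Worked example (illustrative numbers): with 30% of memory
		 * mapped, prev_priority at 6 and the default vm_swappiness
		 * of 60 this gives
		 *
		 *	distress      = 100 >> 6      = 1
		 *	swap_tendency = 30/2 + 1 + 60 = 76
		 *
		 * which stays below 100, so mapped pages are left alone; only
		 * as prev_priority falls (distress grows) or swappiness is
		 * raised does the sum cross 100 and reclaim_mapped get set.
		 */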
1232
1233		/*
1234		 * Now use this metric to decide whether to start moving mapped
1235		 * memory onto the inactive list.
1236		 */
1237		if (swap_tendency >= 100)
1238			reclaim_mapped = 1;
1239	}
1240
1241	lru_add_drain();
1242	spin_lock_irq(&zone->lru_lock);
1243	pgmoved = isolate_lru_pages(nr_pages, &zone->active_list,
1244				    &l_hold, &pgscanned);
1245	zone->pages_scanned += pgscanned;
1246	zone->nr_active -= pgmoved;
1247	spin_unlock_irq(&zone->lru_lock);
1248
1249	while (!list_empty(&l_hold)) {
1250		cond_resched();
1251		page = lru_to_page(&l_hold);
1252		list_del(&page->lru);
1253		if (page_mapped(page)) {
1254			if (!reclaim_mapped ||
1255			    (total_swap_pages == 0 && PageAnon(page)) ||
1256			    page_referenced(page, 0)) {
1257				list_add(&page->lru, &l_active);
1258				continue;
1259			}
1260		}
1261		list_add(&page->lru, &l_inactive);
1262	}
1263
1264	pagevec_init(&pvec, 1);
1265	pgmoved = 0;
1266	spin_lock_irq(&zone->lru_lock);
1267	while (!list_empty(&l_inactive)) {
1268		page = lru_to_page(&l_inactive);
1269		prefetchw_prev_lru_page(page, &l_inactive, flags);
1270		BUG_ON(PageLRU(page));
1271		SetPageLRU(page);
1272		BUG_ON(!PageActive(page));
1273		ClearPageActive(page);
1274
1275		list_move(&page->lru, &zone->inactive_list);
1276		pgmoved++;
1277		if (!pagevec_add(&pvec, page)) {
1278			zone->nr_inactive += pgmoved;
1279			spin_unlock_irq(&zone->lru_lock);
1280			pgdeactivate += pgmoved;
1281			pgmoved = 0;
1282			if (buffer_heads_over_limit)
1283				pagevec_strip(&pvec);
1284			__pagevec_release(&pvec);
1285			spin_lock_irq(&zone->lru_lock);
1286		}
1287	}
1288	zone->nr_inactive += pgmoved;
1289	pgdeactivate += pgmoved;
1290	if (buffer_heads_over_limit) {
1291		spin_unlock_irq(&zone->lru_lock);
1292		pagevec_strip(&pvec);
1293		spin_lock_irq(&zone->lru_lock);
1294	}
1295
1296	pgmoved = 0;
1297	while (!list_empty(&l_active)) {
1298		page = lru_to_page(&l_active);
1299		prefetchw_prev_lru_page(page, &l_active, flags);
1300		BUG_ON(PageLRU(page));
1301		SetPageLRU(page);
1302		BUG_ON(!PageActive(page));
1303		list_move(&page->lru, &zone->active_list);
1304		pgmoved++;
1305		if (!pagevec_add(&pvec, page)) {
1306			zone->nr_active += pgmoved;
1307			pgmoved = 0;
1308			spin_unlock_irq(&zone->lru_lock);
1309			__pagevec_release(&pvec);
1310			spin_lock_irq(&zone->lru_lock);
1311		}
1312	}
1313	zone->nr_active += pgmoved;
1314	spin_unlock(&zone->lru_lock);
1315
1316	__mod_page_state_zone(zone, pgrefill, pgscanned);
1317	__mod_page_state(pgdeactivate, pgdeactivate);
1318	local_irq_enable();
1319
1320	pagevec_release(&pvec);
1321}
1322
1323/*
1324 * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
1325 */
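/*
 * Worked example of the scan batching below (illustrative numbers): a zone
 * with 1,000,000 inactive pages scanned at priority 12 adds
 * (1000000 >> 12) + 1 = 245 pages to nr_scan_inactive on each call.  Only
 * once the accumulated count reaches sc->swap_cluster_max is it handed to
 * shrink_cache(), in swap_cluster_max sized pieces; as the priority drops
 * towards 0 the shift shrinks until the whole list is scanned in one call.
 */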
1326static void
1327shrink_zone(int priority, struct zone *zone, struct scan_control *sc)
1328{
1329	unsigned long nr_active;
1330	unsigned long nr_inactive;
1331	unsigned long nr_to_scan;
1332
1333	atomic_inc(&zone->reclaim_in_progress);
1334
1335	/*
1336	 * Add one to `nr_to_scan' just to make sure that the kernel will
1337	 * slowly sift through the active list.
1338	 */
1339	zone->nr_scan_active += (zone->nr_active >> priority) + 1;
1340	nr_active = zone->nr_scan_active;
1341	if (nr_active >= sc->swap_cluster_max)
1342		zone->nr_scan_active = 0;
1343	else
1344		nr_active = 0;
1345
1346	zone->nr_scan_inactive += (zone->nr_inactive >> priority) + 1;
1347	nr_inactive = zone->nr_scan_inactive;
1348	if (nr_inactive >= sc->swap_cluster_max)
1349		zone->nr_scan_inactive = 0;
1350	else
1351		nr_inactive = 0;
1352
1353	while (nr_active || nr_inactive) {
1354		if (nr_active) {
1355			nr_to_scan = min(nr_active,
1356					(unsigned long)sc->swap_cluster_max);
1357			nr_active -= nr_to_scan;
1358			refill_inactive_zone(nr_to_scan, zone, sc);
1359		}
1360
1361		if (nr_inactive) {
1362			nr_to_scan = min(nr_inactive,
1363					(unsigned long)sc->swap_cluster_max);
1364			nr_inactive -= nr_to_scan;
1365			shrink_cache(nr_to_scan, zone, sc);
1366		}
1367	}
1368
1369	throttle_vm_writeout();
1370
1371	atomic_dec(&zone->reclaim_in_progress);
1372}
1373
1374/*
1375 * This is the direct reclaim path, for page-allocating processes.  We only
1376 * try to reclaim pages from zones which will satisfy the caller's allocation
1377 * request.
1378 *
1379 * We reclaim from a zone even if that zone is over pages_high.  Because:
1380 * a) The caller may be trying to free *extra* pages to satisfy a higher-order
1381 *    allocation or
1382 * b) The zones may be over pages_high but they must go *over* pages_high to
1383 *    satisfy the `incremental min' zone defense algorithm.
1384 *
1385 * Returns the number of reclaimed pages.
1386 *
1387 * If a zone is deemed to be full of pinned pages then just give it a light
1388 * scan then give up on it.
1389 */
1390static void
1391shrink_caches(int priority, struct zone **zones, struct scan_control *sc)
1392{
1393	int i;
1394
1395	for (i = 0; zones[i] != NULL; i++) {
1396		struct zone *zone = zones[i];
1397
1398		if (!populated_zone(zone))
1399			continue;
1400
1401		if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
1402			continue;
1403
1404		zone->temp_priority = priority;
1405		if (zone->prev_priority > priority)
1406			zone->prev_priority = priority;
1407
1408		if (zone->all_unreclaimable && priority != DEF_PRIORITY)
1409			continue;	/* Let kswapd poll it */
1410
1411		shrink_zone(priority, zone, sc);
1412	}
1413}
1414
1415/*
1416 * This is the main entry point to direct page reclaim.
1417 *
1418 * If a full scan of the inactive list fails to free enough memory then we
1419 * are "out of memory" and something needs to be killed.
1420 *
1421 * If the caller is !__GFP_FS then the probability of a failure is reasonably
1422 * high - the zone may be full of dirty or under-writeback pages, which this
1423 * caller can't do much about.  We kick pdflush and take explicit naps in the
1424 * hope that some of these pages can be written.  But if the allocating task
1425 * holds filesystem locks which prevent writeout this might not work, and the
1426 * allocation attempt will fail.
1427 */
1428int try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
1429{
1430	int priority;
1431	int ret = 0;
1432	int total_scanned = 0, total_reclaimed = 0;
1433	struct reclaim_state *reclaim_state = current->reclaim_state;
1434	struct scan_control sc;
1435	unsigned long lru_pages = 0;
1436	int i;
1437
1438	sc.gfp_mask = gfp_mask;
1439	sc.may_writepage = !laptop_mode;
1440	sc.may_swap = 1;
1441
1442	inc_page_state(allocstall);
1443
1444	for (i = 0; zones[i] != NULL; i++) {
1445		struct zone *zone = zones[i];
1446
1447		if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
1448			continue;
1449
1450		zone->temp_priority = DEF_PRIORITY;
1451		lru_pages += zone->nr_active + zone->nr_inactive;
1452	}
1453
1454	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
1455		sc.nr_mapped = read_page_state(nr_mapped);
1456		sc.nr_scanned = 0;
1457		sc.nr_reclaimed = 0;
1458		sc.swap_cluster_max = SWAP_CLUSTER_MAX;
1459		if (!priority)
1460			disable_swap_token();
1461		shrink_caches(priority, zones, &sc);
1462		shrink_slab(sc.nr_scanned, gfp_mask, lru_pages);
1463		if (reclaim_state) {
1464			sc.nr_reclaimed += reclaim_state->reclaimed_slab;
1465			reclaim_state->reclaimed_slab = 0;
1466		}
1467		total_scanned += sc.nr_scanned;
1468		total_reclaimed += sc.nr_reclaimed;
1469		if (total_reclaimed >= sc.swap_cluster_max) {
1470			ret = 1;
1471			goto out;
1472		}
1473
1474		/*
1475		 * Try to write back as many pages as we just scanned.  This
1476		 * tends to cause slow streaming writers to write data to the
1477		 * disk smoothly, at the dirtying rate, which is nice.   But
1478		 * that's undesirable in laptop mode, where we *want* lumpy
1479		 * writeout.  So in laptop mode, write out the whole world.
1480		 */
1481		if (total_scanned > sc.swap_cluster_max + sc.swap_cluster_max/2) {
1482			wakeup_pdflush(laptop_mode ? 0 : total_scanned);
1483			sc.may_writepage = 1;
1484		}
1485
1486		/* Take a nap, wait for some writeback to complete */
1487		if (sc.nr_scanned && priority < DEF_PRIORITY - 2)
1488			blk_congestion_wait(WRITE, HZ/10);
1489	}
1490out:
1491	for (i = 0; zones[i] != 0; i++) {
1492		struct zone *zone = zones[i];
1493
1494		if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
1495			continue;
1496
1497		zone->prev_priority = zone->temp_priority;
1498	}
1499	return ret;
1500}
1501
1502/*
1503 * For kswapd, balance_pgdat() will work across all this node's zones until
1504 * they are all at pages_high.
1505 *
1506 * If `nr_pages' is non-zero then it is the number of pages which are to be
1507 * reclaimed, regardless of the zone occupancies.  This is a software suspend
1508 * special.
1509 *
1510 * Returns the number of pages which were actually freed.
1511 *
1512 * There is special handling here for zones which are full of pinned pages.
1513 * This can happen if the pages are all mlocked, or if they are all used by
1514 * device drivers (say, ZONE_DMA).  Or if they are all in use by hugetlb.
1515 * What we do is to detect the case where all pages in the zone have been
1516 * scanned twice and there has been zero successful reclaim.  Mark the zone as
1517 * dead and from now on, only perform a short scan.  Basically we're polling
1518 * the zone for when the problem goes away.
1519 *
1520 * kswapd scans the zones in the highmem->normal->dma direction.  It skips
1521 * zones which have free_pages > pages_high, but once a zone is found to have
1522 * free_pages <= pages_high, we scan that zone and the lower zones regardless
1523 * of the number of free pages in the lower zones.  This interoperates with
1524 * the page allocator fallback scheme to ensure that aging of pages is balanced
1525 * across the zones.
1526 */
1527static int balance_pgdat(pg_data_t *pgdat, int nr_pages, int order)
1528{
1529	int to_free = nr_pages;
1530	int all_zones_ok;
1531	int priority;
1532	int i;
1533	int total_scanned, total_reclaimed;
1534	struct reclaim_state *reclaim_state = current->reclaim_state;
1535	struct scan_control sc;
1536
1537loop_again:
1538	total_scanned = 0;
1539	total_reclaimed = 0;
1540	sc.gfp_mask = GFP_KERNEL;
1541	sc.may_writepage = !laptop_mode;
1542	sc.may_swap = 1;
1543	sc.nr_mapped = read_page_state(nr_mapped);
1544
1545	inc_page_state(pageoutrun);
1546
1547	for (i = 0; i < pgdat->nr_zones; i++) {
1548		struct zone *zone = pgdat->node_zones + i;
1549
1550		zone->temp_priority = DEF_PRIORITY;
1551	}
1552
1553	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
1554		int end_zone = 0;	/* Inclusive.  0 = ZONE_DMA */
1555		unsigned long lru_pages = 0;
1556
1557		/* The swap token gets in the way of swapout... */
1558		if (!priority)
1559			disable_swap_token();
1560
1561		all_zones_ok = 1;
1562
1563		if (nr_pages == 0) {
1564			/*
1565			 * Scan in the highmem->dma direction for the highest
1566			 * zone which needs scanning
1567			 */
1568			for (i = pgdat->nr_zones - 1; i >= 0; i--) {
1569				struct zone *zone = pgdat->node_zones + i;
1570
1571				if (!populated_zone(zone))
1572					continue;
1573
1574				if (zone->all_unreclaimable &&
1575						priority != DEF_PRIORITY)
1576					continue;
1577
1578				if (!zone_watermark_ok(zone, order,
1579						zone->pages_high, 0, 0)) {
1580					end_zone = i;
1581					goto scan;
1582				}
1583			}
1584			goto out;
1585		} else {
1586			end_zone = pgdat->nr_zones - 1;
1587		}
1588scan:
1589		for (i = 0; i <= end_zone; i++) {
1590			struct zone *zone = pgdat->node_zones + i;
1591
1592			lru_pages += zone->nr_active + zone->nr_inactive;
1593		}
1594
1595		/*
1596		 * Now scan the zone in the dma->highmem direction, stopping
1597		 * at the last zone which needs scanning.
1598		 *
1599		 * We do this because the page allocator works in the opposite
1600		 * direction.  This prevents the page allocator from allocating
1601		 * pages behind kswapd's direction of progress, which would
1602		 * cause too much scanning of the lower zones.
1603		 */
1604		for (i = 0; i <= end_zone; i++) {
1605			struct zone *zone = pgdat->node_zones + i;
1606			int nr_slab;
1607
1608			if (!populated_zone(zone))
1609				continue;
1610
1611			if (zone->all_unreclaimable && priority != DEF_PRIORITY)
1612				continue;
1613
1614			if (nr_pages == 0) {	/* Not software suspend */
1615				if (!zone_watermark_ok(zone, order,
1616						zone->pages_high, end_zone, 0))
1617					all_zones_ok = 0;
1618			}
1619			zone->temp_priority = priority;
1620			if (zone->prev_priority > priority)
1621				zone->prev_priority = priority;
1622			sc.nr_scanned = 0;
1623			sc.nr_reclaimed = 0;
1624			sc.swap_cluster_max = nr_pages ? nr_pages : SWAP_CLUSTER_MAX;
1625			shrink_zone(priority, zone, &sc);
1626			reclaim_state->reclaimed_slab = 0;
1627			nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
1628						lru_pages);
1629			sc.nr_reclaimed += reclaim_state->reclaimed_slab;
1630			total_reclaimed += sc.nr_reclaimed;
1631			total_scanned += sc.nr_scanned;
1632			if (zone->all_unreclaimable)
1633				continue;
1634			if (nr_slab == 0 && zone->pages_scanned >=
1635				    (zone->nr_active + zone->nr_inactive) * 4)
1636				zone->all_unreclaimable = 1;
1637			/*
1638			 * If we've done a decent amount of scanning and
1639			 * the reclaim ratio is low, start doing writepage
1640			 * even in laptop mode
1641			 */
1642			if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
1643			    total_scanned > total_reclaimed+total_reclaimed/2)
1644				sc.may_writepage = 1;
1645		}
1646		if (nr_pages && to_free > total_reclaimed)
1647			continue;	/* swsusp: need to do more work */
1648		if (all_zones_ok)
1649			break;		/* kswapd: all done */
1650		/*
1651		 * OK, kswapd is getting into trouble.  Take a nap, then take
1652		 * another pass across the zones.
1653		 */
1654		if (total_scanned && priority < DEF_PRIORITY - 2)
1655			blk_congestion_wait(WRITE, HZ/10);
1656
1657		/*
1658		 * We do this so kswapd doesn't build up large priorities for
1659		 * example when it is freeing in parallel with allocators. It
1660		 * matches the direct reclaim path behaviour in terms of impact
1661		 * on zone->*_priority.
1662		 */
1663		if ((total_reclaimed >= SWAP_CLUSTER_MAX) && (!nr_pages))
1664			break;
1665	}
1666out:
1667	for (i = 0; i < pgdat->nr_zones; i++) {
1668		struct zone *zone = pgdat->node_zones + i;
1669
1670		zone->prev_priority = zone->temp_priority;
1671	}
1672	if (!all_zones_ok) {
1673		cond_resched();
1674		goto loop_again;
1675	}
1676
1677	return total_reclaimed;
1678}
1679
1680/*
1681 * The background pageout daemon, started as a kernel thread
1682 * from the init process.
1683 *
1684 * This basically trickles out pages so that we have _some_
1685 * free memory available even if there is no other activity
1686 * that frees anything up. This is needed for things like routing
1687 * etc, where we otherwise might have all activity going on in
1688 * asynchronous contexts that cannot page things out.
1689 *
1690 * If there are applications that are active memory-allocators
1691 * (most normal use), this basically shouldn't matter.
1692 */
1693static int kswapd(void *p)
1694{
1695	unsigned long order;
1696	pg_data_t *pgdat = (pg_data_t*)p;
1697	struct task_struct *tsk = current;
1698	DEFINE_WAIT(wait);
1699	struct reclaim_state reclaim_state = {
1700		.reclaimed_slab = 0,
1701	};
1702	cpumask_t cpumask;
1703
1704	daemonize("kswapd%d", pgdat->node_id);
1705	cpumask = node_to_cpumask(pgdat->node_id);
1706	if (!cpus_empty(cpumask))
1707		set_cpus_allowed(tsk, cpumask);
1708	current->reclaim_state = &reclaim_state;
1709
1710	/*
1711	 * Tell the memory management that we're a "memory allocator",
1712	 * and that if we need more memory we should get access to it
1713	 * regardless (see "__alloc_pages()"). "kswapd" should
1714	 * never get caught in the normal page freeing logic.
1715	 *
1716	 * (Kswapd normally doesn't need memory anyway, but sometimes
1717	 * you need a small amount of memory in order to be able to
1718	 * page out something else, and this flag essentially protects
1719	 * us from recursively trying to free more memory as we're
1720	 * trying to free the first piece of memory in the first place).
1721	 */
1722	tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
1723
1724	order = 0;
1725	for ( ; ; ) {
1726		unsigned long new_order;
1727
1728		try_to_freeze();
1729
1730		prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
1731		new_order = pgdat->kswapd_max_order;
1732		pgdat->kswapd_max_order = 0;
1733		if (order < new_order) {
1734			/*
1735			 * Don't sleep if someone wants a larger 'order'
1736			 * allocation
1737			 */
1738			order = new_order;
1739		} else {
1740			schedule();
1741			order = pgdat->kswapd_max_order;
1742		}
1743		finish_wait(&pgdat->kswapd_wait, &wait);
1744
1745		balance_pgdat(pgdat, 0, order);
1746	}
1747	return 0;
1748}
1749
1750/*
1751 * A zone is low on free memory, so wake its kswapd task to service it.
1752 */
1753void wakeup_kswapd(struct zone *zone, int order)
1754{
1755	pg_data_t *pgdat;
1756
1757	if (!populated_zone(zone))
1758		return;
1759
1760	pgdat = zone->zone_pgdat;
1761	if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0))
1762		return;
1763	if (pgdat->kswapd_max_order < order)
1764		pgdat->kswapd_max_order = order;
1765	if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
1766		return;
1767	if (!waitqueue_active(&pgdat->kswapd_wait))
1768		return;
1769	wake_up_interruptible(&pgdat->kswapd_wait);
1770}
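
/*
 * Illustrative sketch: the page allocator wakes kswapd for every zone in
 * the zonelist once the low watermark check fails, roughly
 *
 *	for (i = 0; zones[i] != NULL; i++)
 *		wakeup_kswapd(zones[i], order);
 *
 * before trying the slower allocation paths.
 */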
1771
1772#ifdef CONFIG_PM
1773/*
1774 * Try to free `nr_pages' of memory, system-wide.  Returns the number of freed
1775 * pages.
1776 */
1777int shrink_all_memory(int nr_pages)
1778{
1779	pg_data_t *pgdat;
1780	int nr_to_free = nr_pages;
1781	int ret = 0;
1782	struct reclaim_state reclaim_state = {
1783		.reclaimed_slab = 0,
1784	};
1785
1786	current->reclaim_state = &reclaim_state;
1787	for_each_pgdat(pgdat) {
1788		int freed;
1789		freed = balance_pgdat(pgdat, nr_to_free, 0);
1790		ret += freed;
1791		nr_to_free -= freed;
1792		if (nr_to_free <= 0)
1793			break;
1794	}
1795	current->reclaim_state = NULL;
1796	return ret;
1797}
1798#endif
1799
1800#ifdef CONFIG_HOTPLUG_CPU
1801/* It's optimal to keep kswapds on the same CPUs as their memory, but
1802   not required for correctness.  So if the last cpu in a node goes
1803   away, we get changed to run anywhere: as the first one comes back,
1804   restore their cpu bindings. */
1805static int __devinit cpu_callback(struct notifier_block *nfb,
1806				  unsigned long action,
1807				  void *hcpu)
1808{
1809	pg_data_t *pgdat;
1810	cpumask_t mask;
1811
1812	if (action == CPU_ONLINE) {
1813		for_each_pgdat(pgdat) {
1814			mask = node_to_cpumask(pgdat->node_id);
1815			if (any_online_cpu(mask) != NR_CPUS)
1816				/* One of our CPUs online: restore mask */
1817				set_cpus_allowed(pgdat->kswapd, mask);
1818		}
1819	}
1820	return NOTIFY_OK;
1821}
1822#endif /* CONFIG_HOTPLUG_CPU */
1823
1824static int __init kswapd_init(void)
1825{
1826	pg_data_t *pgdat;
1827	swap_setup();
1828	for_each_pgdat(pgdat)
1829		pgdat->kswapd
1830		= find_task_by_pid(kernel_thread(kswapd, pgdat, CLONE_KERNEL));
1831	total_memory = nr_free_pagecache_pages();
1832	hotcpu_notifier(cpu_callback, 0);
1833	return 0;
1834}
1835
1836module_init(kswapd_init)
1837
1838#ifdef CONFIG_NUMA
1839/*
1840 * Zone reclaim mode
1841 *
1842 * If non-zero call zone_reclaim when the number of free pages falls below
1843 * the watermarks.
1844 *
1845 * In the future we may add flags to the mode. However, the page allocator
1846 * should only have to check that zone_reclaim_mode != 0 before calling
1847 * zone_reclaim().
1848 */
1849int zone_reclaim_mode __read_mostly;
1850
1851#define RECLAIM_OFF 0
1852#define RECLAIM_ZONE (1<<0)	/* Run shrink_cache on the zone */
1853#define RECLAIM_WRITE (1<<1)	/* Writeout pages during reclaim */
1854#define RECLAIM_SWAP (1<<2)	/* Swap pages out during reclaim */
1855#define RECLAIM_SLAB (1<<3)	/* Do a global slab shrink if the zone is out of memory */
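
/*
 * For example (illustrative): a zone_reclaim_mode of 3 is
 * RECLAIM_ZONE | RECLAIM_WRITE -- reclaim pagecache from the zone and
 * allow dirty pages to be written back while doing so, but neither swap
 * mapped pages (RECLAIM_SWAP) nor shrink the slab (RECLAIM_SLAB).
 */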
1856
1857/*
1858 * Minimum time between zone reclaim scans
1859 */
1860int zone_reclaim_interval __read_mostly = 30*HZ;
1861
1862/*
1863 * Priority for ZONE_RECLAIM. This determines the fraction of pages
1864 * of a node considered for each zone_reclaim. 4 scans 1/16th of
1865 * a zone.
1866 */
1867#define ZONE_RECLAIM_PRIORITY 4
1868
1869/*
1870 * Try to free up some pages from this zone through reclaim.
1871 */
1872int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1873{
1874	int nr_pages;
1875	struct task_struct *p = current;
1876	struct reclaim_state reclaim_state;
1877	struct scan_control sc;
1878	cpumask_t mask;
1879	int node_id;
1880	int priority;
1881
1882	if (time_before(jiffies,
1883		zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval))
1884			return 0;
1885
1886	if (!(gfp_mask & __GFP_WAIT) ||
1887		zone->all_unreclaimable ||
1888		atomic_read(&zone->reclaim_in_progress) > 0 ||
1889		(p->flags & PF_MEMALLOC))
1890			return 0;
1891
1892	node_id = zone->zone_pgdat->node_id;
1893	mask = node_to_cpumask(node_id);
1894	if (!cpus_empty(mask) && node_id != numa_node_id())
1895		return 0;
1896
1897	sc.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE);
1898	sc.may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP);
1899	sc.nr_scanned = 0;
1900	sc.nr_reclaimed = 0;
1901	sc.nr_mapped = read_page_state(nr_mapped);
1902	sc.gfp_mask = gfp_mask;
1903
1904	disable_swap_token();
1905
1906	nr_pages = 1 << order;
1907	if (nr_pages > SWAP_CLUSTER_MAX)
1908		sc.swap_cluster_max = nr_pages;
1909	else
1910		sc.swap_cluster_max = SWAP_CLUSTER_MAX;
1911
1912	cond_resched();
1913	/*
1914	 * We need to be able to allocate from the reserves for RECLAIM_SWAP
1915	 * and we also need to be able to write out pages for RECLAIM_WRITE
1916	 * and RECLAIM_SWAP.
1917	 */
1918	p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
1919	reclaim_state.reclaimed_slab = 0;
1920	p->reclaim_state = &reclaim_state;
1921
1922	/*
1923	 * Free memory by calling shrink zone with increasing priorities
1924	 * until we have enough memory freed.
1925	 */
1926	priority = ZONE_RECLAIM_PRIORITY;
1927	do {
1928		shrink_zone(priority, zone, &sc);
1929		priority--;
1930	} while (priority >= 0 && sc.nr_reclaimed < nr_pages);
1931
1932	if (sc.nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) {
1933		/*
1934		 * shrink_slab does not currently allow us to determine
1935		 * how many pages were freed in the zone. So we just
1936		 * shake the slab and then go off-node for a single allocation.
1937		 *
1938		 * shrink_slab will free memory on all zones and may take
1939		 * a long time.
1940		 */
1941		shrink_slab(sc.nr_scanned, gfp_mask, order);
1942	}
1943
1944	p->reclaim_state = NULL;
1945	current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
1946
1947	if (sc.nr_reclaimed == 0)
1948		zone->last_unsuccessful_zone_reclaim = jiffies;
1949
1950	return sc.nr_reclaimed >= nr_pages;
1951}
1952#endif
1953
1954