vmscan.c revision cc59850ef940e4ee6a765d28b439b9bafe07cf63
1/*
2 *  linux/mm/vmscan.c
3 *
4 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
5 *
6 *  Swap reorganised 29.12.95, Stephen Tweedie.
7 *  kswapd added: 7.1.96  sct
8 *  Removed kswapd_ctl limits, and swap out as many pages as needed
9 *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
10 *  Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
11 *  Multiqueue VM started 5.8.00, Rik van Riel.
12 */
13
14#include <linux/mm.h>
15#include <linux/module.h>
16#include <linux/gfp.h>
17#include <linux/kernel_stat.h>
18#include <linux/swap.h>
19#include <linux/pagemap.h>
20#include <linux/init.h>
21#include <linux/highmem.h>
22#include <linux/vmstat.h>
23#include <linux/file.h>
24#include <linux/writeback.h>
25#include <linux/blkdev.h>
26#include <linux/buffer_head.h>	/* for try_to_release_page(),
27					buffer_heads_over_limit */
28#include <linux/mm_inline.h>
29#include <linux/pagevec.h>
30#include <linux/backing-dev.h>
31#include <linux/rmap.h>
32#include <linux/topology.h>
33#include <linux/cpu.h>
34#include <linux/cpuset.h>
35#include <linux/compaction.h>
36#include <linux/notifier.h>
37#include <linux/rwsem.h>
38#include <linux/delay.h>
39#include <linux/kthread.h>
40#include <linux/freezer.h>
41#include <linux/memcontrol.h>
42#include <linux/delayacct.h>
43#include <linux/sysctl.h>
44#include <linux/oom.h>
45#include <linux/prefetch.h>
46
47#include <asm/tlbflush.h>
48#include <asm/div64.h>
49
50#include <linux/swapops.h>
51
52#include "internal.h"
53
54#define CREATE_TRACE_POINTS
55#include <trace/events/vmscan.h>
56
57/*
58 * reclaim_mode determines how the inactive list is shrunk
59 * RECLAIM_MODE_SINGLE: Reclaim only order-0 pages
60 * RECLAIM_MODE_ASYNC:  Do not block
61 * RECLAIM_MODE_SYNC:   Allow blocking e.g. call wait_on_page_writeback
62 * RECLAIM_MODE_LUMPYRECLAIM: For high-order allocations, take a reference
63 *			page from the LRU and reclaim all pages within a
64 *			naturally aligned range
65 * RECLAIM_MODE_COMPACTION: For high-order allocations, reclaim a number of
66 *			order-0 pages and then compact the zone
67 */
68typedef unsigned __bitwise__ reclaim_mode_t;
69#define RECLAIM_MODE_SINGLE		((__force reclaim_mode_t)0x01u)
70#define RECLAIM_MODE_ASYNC		((__force reclaim_mode_t)0x02u)
71#define RECLAIM_MODE_SYNC		((__force reclaim_mode_t)0x04u)
72#define RECLAIM_MODE_LUMPYRECLAIM	((__force reclaim_mode_t)0x08u)
73#define RECLAIM_MODE_COMPACTION		((__force reclaim_mode_t)0x10u)
74
75struct scan_control {
76	/* Incremented by the number of inactive pages that were scanned */
77	unsigned long nr_scanned;
78
79	/* Number of pages freed so far during a call to shrink_zones() */
80	unsigned long nr_reclaimed;
81
82	/* How many pages shrink_list() should reclaim */
83	unsigned long nr_to_reclaim;
84
85	unsigned long hibernation_mode;
86
87	/* This context's GFP mask */
88	gfp_t gfp_mask;
89
90	int may_writepage;
91
92	/* Can mapped pages be reclaimed? */
93	int may_unmap;
94
95	/* Can pages be swapped as part of reclaim? */
96	int may_swap;
97
98	int order;
99
100	/*
101	 * Intend to reclaim enough continuous memory rather than reclaim
102	 * enough amount of memory. i.e, mode for high order allocation.
103	 */
104	reclaim_mode_t reclaim_mode;
105
106	/* Which cgroup do we reclaim from */
107	struct mem_cgroup *mem_cgroup;
108
109	/*
110	 * Nodemask of nodes allowed by the caller. If NULL, all nodes
111	 * are scanned.
112	 */
113	nodemask_t	*nodemask;
114};
115
116#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
117
118#ifdef ARCH_HAS_PREFETCH
119#define prefetch_prev_lru_page(_page, _base, _field)			\
120	do {								\
121		if ((_page)->lru.prev != _base) {			\
122			struct page *prev;				\
123									\
124			prev = lru_to_page(&(_page->lru));		\
125			prefetch(&prev->_field);			\
126		}							\
127	} while (0)
128#else
129#define prefetch_prev_lru_page(_page, _base, _field) do { } while (0)
130#endif
131
132#ifdef ARCH_HAS_PREFETCHW
133#define prefetchw_prev_lru_page(_page, _base, _field)			\
134	do {								\
135		if ((_page)->lru.prev != _base) {			\
136			struct page *prev;				\
137									\
138			prev = lru_to_page(&(_page->lru));		\
139			prefetchw(&prev->_field);			\
140		}							\
141	} while (0)
142#else
143#define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
144#endif
145
146/*
147 * From 0 .. 100.  Higher means more swappy.
148 */
149int vm_swappiness = 60;
150long vm_total_pages;	/* The total number of pages which the VM controls */
151
152static LIST_HEAD(shrinker_list);
153static DECLARE_RWSEM(shrinker_rwsem);
154
155#ifdef CONFIG_CGROUP_MEM_RES_CTLR
156#define scanning_global_lru(sc)	(!(sc)->mem_cgroup)
157#else
158#define scanning_global_lru(sc)	(1)
159#endif
160
161static struct zone_reclaim_stat *get_reclaim_stat(struct zone *zone,
162						  struct scan_control *sc)
163{
164	if (!scanning_global_lru(sc))
165		return mem_cgroup_get_reclaim_stat(sc->mem_cgroup, zone);
166
167	return &zone->reclaim_stat;
168}
169
170static unsigned long zone_nr_lru_pages(struct zone *zone,
171				struct scan_control *sc, enum lru_list lru)
172{
173	if (!scanning_global_lru(sc))
174		return mem_cgroup_zone_nr_lru_pages(sc->mem_cgroup,
175				zone_to_nid(zone), zone_idx(zone), BIT(lru));
176
177	return zone_page_state(zone, NR_LRU_BASE + lru);
178}
179
180
181/*
182 * Add a shrinker callback to be called from the vm
183 */
184void register_shrinker(struct shrinker *shrinker)
185{
186	atomic_long_set(&shrinker->nr_in_batch, 0);
187	down_write(&shrinker_rwsem);
188	list_add_tail(&shrinker->list, &shrinker_list);
189	up_write(&shrinker_rwsem);
190}
191EXPORT_SYMBOL(register_shrinker);
192
193/*
194 * Remove one
195 */
196void unregister_shrinker(struct shrinker *shrinker)
197{
198	down_write(&shrinker_rwsem);
199	list_del(&shrinker->list);
200	up_write(&shrinker_rwsem);
201}
202EXPORT_SYMBOL(unregister_shrinker);
203
204static inline int do_shrinker_shrink(struct shrinker *shrinker,
205				     struct shrink_control *sc,
206				     unsigned long nr_to_scan)
207{
208	sc->nr_to_scan = nr_to_scan;
209	return (*shrinker->shrink)(shrinker, sc);
210}
211
212#define SHRINK_BATCH 128
213/*
214 * Call the shrink functions to age shrinkable caches
215 *
216 * Here we assume it costs one seek to replace a lru page and that it also
217 * takes a seek to recreate a cache object.  With this in mind we age equal
218 * percentages of the lru and ageable caches.  This should balance the seeks
219 * generated by these structures.
220 *
221 * If the vm encountered mapped pages on the LRU it increase the pressure on
222 * slab to avoid swapping.
223 *
224 * We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits.
225 *
226 * `lru_pages' represents the number of on-LRU pages in all the zones which
227 * are eligible for the caller's allocation attempt.  It is used for balancing
228 * slab reclaim versus page reclaim.
229 *
230 * Returns the number of slab objects which we shrunk.
231 */
232unsigned long shrink_slab(struct shrink_control *shrink,
233			  unsigned long nr_pages_scanned,
234			  unsigned long lru_pages)
235{
236	struct shrinker *shrinker;
237	unsigned long ret = 0;
238
239	if (nr_pages_scanned == 0)
240		nr_pages_scanned = SWAP_CLUSTER_MAX;
241
242	if (!down_read_trylock(&shrinker_rwsem)) {
243		/* Assume we'll be able to shrink next time */
244		ret = 1;
245		goto out;
246	}
247
248	list_for_each_entry(shrinker, &shrinker_list, list) {
249		unsigned long long delta;
250		long total_scan;
251		long max_pass;
252		int shrink_ret = 0;
253		long nr;
254		long new_nr;
255		long batch_size = shrinker->batch ? shrinker->batch
256						  : SHRINK_BATCH;
257
258		max_pass = do_shrinker_shrink(shrinker, shrink, 0);
259		if (max_pass <= 0)
260			continue;
261
262		/*
263		 * copy the current shrinker scan count into a local variable
264		 * and zero it so that other concurrent shrinker invocations
265		 * don't also do this scanning work.
266		 */
267		nr = atomic_long_xchg(&shrinker->nr_in_batch, 0);
268
269		total_scan = nr;
270		delta = (4 * nr_pages_scanned) / shrinker->seeks;
271		delta *= max_pass;
272		do_div(delta, lru_pages + 1);
273		total_scan += delta;
274		if (total_scan < 0) {
275			printk(KERN_ERR "shrink_slab: %pF negative objects to "
276			       "delete nr=%ld\n",
277			       shrinker->shrink, total_scan);
278			total_scan = max_pass;
279		}
280
281		/*
282		 * We need to avoid excessive windup on filesystem shrinkers
283		 * due to large numbers of GFP_NOFS allocations causing the
284		 * shrinkers to return -1 all the time. This results in a large
285		 * nr being built up so when a shrink that can do some work
286		 * comes along it empties the entire cache due to nr >>>
287		 * max_pass.  This is bad for sustaining a working set in
288		 * memory.
289		 *
290		 * Hence only allow the shrinker to scan the entire cache when
291		 * a large delta change is calculated directly.
292		 */
293		if (delta < max_pass / 4)
294			total_scan = min(total_scan, max_pass / 2);
295
296		/*
297		 * Avoid risking looping forever due to too large nr value:
298		 * never try to free more than twice the estimate number of
299		 * freeable entries.
300		 */
301		if (total_scan > max_pass * 2)
302			total_scan = max_pass * 2;
303
304		trace_mm_shrink_slab_start(shrinker, shrink, nr,
305					nr_pages_scanned, lru_pages,
306					max_pass, delta, total_scan);
307
308		while (total_scan >= batch_size) {
309			int nr_before;
310
311			nr_before = do_shrinker_shrink(shrinker, shrink, 0);
312			shrink_ret = do_shrinker_shrink(shrinker, shrink,
313							batch_size);
314			if (shrink_ret == -1)
315				break;
316			if (shrink_ret < nr_before)
317				ret += nr_before - shrink_ret;
318			count_vm_events(SLABS_SCANNED, batch_size);
319			total_scan -= batch_size;
320
321			cond_resched();
322		}
323
324		/*
325		 * move the unused scan count back into the shrinker in a
326		 * manner that handles concurrent updates. If we exhausted the
327		 * scan, there is no need to do an update.
328		 */
329		if (total_scan > 0)
330			new_nr = atomic_long_add_return(total_scan,
331					&shrinker->nr_in_batch);
332		else
333			new_nr = atomic_long_read(&shrinker->nr_in_batch);
334
335		trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr);
336	}
337	up_read(&shrinker_rwsem);
338out:
339	cond_resched();
340	return ret;
341}
342
343static void set_reclaim_mode(int priority, struct scan_control *sc,
344				   bool sync)
345{
346	reclaim_mode_t syncmode = sync ? RECLAIM_MODE_SYNC : RECLAIM_MODE_ASYNC;
347
348	/*
349	 * Initially assume we are entering either lumpy reclaim or
350	 * reclaim/compaction.Depending on the order, we will either set the
351	 * sync mode or just reclaim order-0 pages later.
352	 */
353	if (COMPACTION_BUILD)
354		sc->reclaim_mode = RECLAIM_MODE_COMPACTION;
355	else
356		sc->reclaim_mode = RECLAIM_MODE_LUMPYRECLAIM;
357
358	/*
359	 * Avoid using lumpy reclaim or reclaim/compaction if possible by
360	 * restricting when its set to either costly allocations or when
361	 * under memory pressure
362	 */
363	if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
364		sc->reclaim_mode |= syncmode;
365	else if (sc->order && priority < DEF_PRIORITY - 2)
366		sc->reclaim_mode |= syncmode;
367	else
368		sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC;
369}
370
371static void reset_reclaim_mode(struct scan_control *sc)
372{
373	sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC;
374}
375
376static inline int is_page_cache_freeable(struct page *page)
377{
378	/*
379	 * A freeable page cache page is referenced only by the caller
380	 * that isolated the page, the page cache radix tree and
381	 * optional buffer heads at page->private.
382	 */
383	return page_count(page) - page_has_private(page) == 2;
384}
385
386static int may_write_to_queue(struct backing_dev_info *bdi,
387			      struct scan_control *sc)
388{
389	if (current->flags & PF_SWAPWRITE)
390		return 1;
391	if (!bdi_write_congested(bdi))
392		return 1;
393	if (bdi == current->backing_dev_info)
394		return 1;
395
396	/* lumpy reclaim for hugepage often need a lot of write */
397	if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
398		return 1;
399	return 0;
400}
401
402/*
403 * We detected a synchronous write error writing a page out.  Probably
404 * -ENOSPC.  We need to propagate that into the address_space for a subsequent
405 * fsync(), msync() or close().
406 *
407 * The tricky part is that after writepage we cannot touch the mapping: nothing
408 * prevents it from being freed up.  But we have a ref on the page and once
409 * that page is locked, the mapping is pinned.
410 *
411 * We're allowed to run sleeping lock_page() here because we know the caller has
412 * __GFP_FS.
413 */
414static void handle_write_error(struct address_space *mapping,
415				struct page *page, int error)
416{
417	lock_page(page);
418	if (page_mapping(page) == mapping)
419		mapping_set_error(mapping, error);
420	unlock_page(page);
421}
422
423/* possible outcome of pageout() */
424typedef enum {
425	/* failed to write page out, page is locked */
426	PAGE_KEEP,
427	/* move page to the active list, page is locked */
428	PAGE_ACTIVATE,
429	/* page has been sent to the disk successfully, page is unlocked */
430	PAGE_SUCCESS,
431	/* page is clean and locked */
432	PAGE_CLEAN,
433} pageout_t;
434
435/*
436 * pageout is called by shrink_page_list() for each dirty page.
437 * Calls ->writepage().
438 */
439static pageout_t pageout(struct page *page, struct address_space *mapping,
440			 struct scan_control *sc)
441{
442	/*
443	 * If the page is dirty, only perform writeback if that write
444	 * will be non-blocking.  To prevent this allocation from being
445	 * stalled by pagecache activity.  But note that there may be
446	 * stalls if we need to run get_block().  We could test
447	 * PagePrivate for that.
448	 *
449	 * If this process is currently in __generic_file_aio_write() against
450	 * this page's queue, we can perform writeback even if that
451	 * will block.
452	 *
453	 * If the page is swapcache, write it back even if that would
454	 * block, for some throttling. This happens by accident, because
455	 * swap_backing_dev_info is bust: it doesn't reflect the
456	 * congestion state of the swapdevs.  Easy to fix, if needed.
457	 */
458	if (!is_page_cache_freeable(page))
459		return PAGE_KEEP;
460	if (!mapping) {
461		/*
462		 * Some data journaling orphaned pages can have
463		 * page->mapping == NULL while being dirty with clean buffers.
464		 */
465		if (page_has_private(page)) {
466			if (try_to_free_buffers(page)) {
467				ClearPageDirty(page);
468				printk("%s: orphaned page\n", __func__);
469				return PAGE_CLEAN;
470			}
471		}
472		return PAGE_KEEP;
473	}
474	if (mapping->a_ops->writepage == NULL)
475		return PAGE_ACTIVATE;
476	if (!may_write_to_queue(mapping->backing_dev_info, sc))
477		return PAGE_KEEP;
478
479	if (clear_page_dirty_for_io(page)) {
480		int res;
481		struct writeback_control wbc = {
482			.sync_mode = WB_SYNC_NONE,
483			.nr_to_write = SWAP_CLUSTER_MAX,
484			.range_start = 0,
485			.range_end = LLONG_MAX,
486			.for_reclaim = 1,
487		};
488
489		SetPageReclaim(page);
490		res = mapping->a_ops->writepage(page, &wbc);
491		if (res < 0)
492			handle_write_error(mapping, page, res);
493		if (res == AOP_WRITEPAGE_ACTIVATE) {
494			ClearPageReclaim(page);
495			return PAGE_ACTIVATE;
496		}
497
498		if (!PageWriteback(page)) {
499			/* synchronous write or broken a_ops? */
500			ClearPageReclaim(page);
501		}
502		trace_mm_vmscan_writepage(page,
503			trace_reclaim_flags(page, sc->reclaim_mode));
504		inc_zone_page_state(page, NR_VMSCAN_WRITE);
505		return PAGE_SUCCESS;
506	}
507
508	return PAGE_CLEAN;
509}
510
511/*
512 * Same as remove_mapping, but if the page is removed from the mapping, it
513 * gets returned with a refcount of 0.
514 */
515static int __remove_mapping(struct address_space *mapping, struct page *page)
516{
517	BUG_ON(!PageLocked(page));
518	BUG_ON(mapping != page_mapping(page));
519
520	spin_lock_irq(&mapping->tree_lock);
521	/*
522	 * The non racy check for a busy page.
523	 *
524	 * Must be careful with the order of the tests. When someone has
525	 * a ref to the page, it may be possible that they dirty it then
526	 * drop the reference. So if PageDirty is tested before page_count
527	 * here, then the following race may occur:
528	 *
529	 * get_user_pages(&page);
530	 * [user mapping goes away]
531	 * write_to(page);
532	 *				!PageDirty(page)    [good]
533	 * SetPageDirty(page);
534	 * put_page(page);
535	 *				!page_count(page)   [good, discard it]
536	 *
537	 * [oops, our write_to data is lost]
538	 *
539	 * Reversing the order of the tests ensures such a situation cannot
540	 * escape unnoticed. The smp_rmb is needed to ensure the page->flags
541	 * load is not satisfied before that of page->_count.
542	 *
543	 * Note that if SetPageDirty is always performed via set_page_dirty,
544	 * and thus under tree_lock, then this ordering is not required.
545	 */
546	if (!page_freeze_refs(page, 2))
547		goto cannot_free;
548	/* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */
549	if (unlikely(PageDirty(page))) {
550		page_unfreeze_refs(page, 2);
551		goto cannot_free;
552	}
553
554	if (PageSwapCache(page)) {
555		swp_entry_t swap = { .val = page_private(page) };
556		__delete_from_swap_cache(page);
557		spin_unlock_irq(&mapping->tree_lock);
558		swapcache_free(swap, page);
559	} else {
560		void (*freepage)(struct page *);
561
562		freepage = mapping->a_ops->freepage;
563
564		__delete_from_page_cache(page);
565		spin_unlock_irq(&mapping->tree_lock);
566		mem_cgroup_uncharge_cache_page(page);
567
568		if (freepage != NULL)
569			freepage(page);
570	}
571
572	return 1;
573
574cannot_free:
575	spin_unlock_irq(&mapping->tree_lock);
576	return 0;
577}
578
579/*
580 * Attempt to detach a locked page from its ->mapping.  If it is dirty or if
581 * someone else has a ref on the page, abort and return 0.  If it was
582 * successfully detached, return 1.  Assumes the caller has a single ref on
583 * this page.
584 */
585int remove_mapping(struct address_space *mapping, struct page *page)
586{
587	if (__remove_mapping(mapping, page)) {
588		/*
589		 * Unfreezing the refcount with 1 rather than 2 effectively
590		 * drops the pagecache ref for us without requiring another
591		 * atomic operation.
592		 */
593		page_unfreeze_refs(page, 1);
594		return 1;
595	}
596	return 0;
597}
598
599/**
600 * putback_lru_page - put previously isolated page onto appropriate LRU list
601 * @page: page to be put back to appropriate lru list
602 *
603 * Add previously isolated @page to appropriate LRU list.
604 * Page may still be unevictable for other reasons.
605 *
606 * lru_lock must not be held, interrupts must be enabled.
607 */
608void putback_lru_page(struct page *page)
609{
610	int lru;
611	int active = !!TestClearPageActive(page);
612	int was_unevictable = PageUnevictable(page);
613
614	VM_BUG_ON(PageLRU(page));
615
616redo:
617	ClearPageUnevictable(page);
618
619	if (page_evictable(page, NULL)) {
620		/*
621		 * For evictable pages, we can use the cache.
622		 * In event of a race, worst case is we end up with an
623		 * unevictable page on [in]active list.
624		 * We know how to handle that.
625		 */
626		lru = active + page_lru_base_type(page);
627		lru_cache_add_lru(page, lru);
628	} else {
629		/*
630		 * Put unevictable pages directly on zone's unevictable
631		 * list.
632		 */
633		lru = LRU_UNEVICTABLE;
634		add_page_to_unevictable_list(page);
635		/*
636		 * When racing with an mlock or AS_UNEVICTABLE clearing
637		 * (page is unlocked) make sure that if the other thread
638		 * does not observe our setting of PG_lru and fails
639		 * isolation/check_move_unevictable_page,
640		 * we see PG_mlocked/AS_UNEVICTABLE cleared below and move
641		 * the page back to the evictable list.
642		 *
643		 * The other side is TestClearPageMlocked() or shmem_lock().
644		 */
645		smp_mb();
646	}
647
648	/*
649	 * page's status can change while we move it among lru. If an evictable
650	 * page is on unevictable list, it never be freed. To avoid that,
651	 * check after we added it to the list, again.
652	 */
653	if (lru == LRU_UNEVICTABLE && page_evictable(page, NULL)) {
654		if (!isolate_lru_page(page)) {
655			put_page(page);
656			goto redo;
657		}
658		/* This means someone else dropped this page from LRU
659		 * So, it will be freed or putback to LRU again. There is
660		 * nothing to do here.
661		 */
662	}
663
664	if (was_unevictable && lru != LRU_UNEVICTABLE)
665		count_vm_event(UNEVICTABLE_PGRESCUED);
666	else if (!was_unevictable && lru == LRU_UNEVICTABLE)
667		count_vm_event(UNEVICTABLE_PGCULLED);
668
669	put_page(page);		/* drop ref from isolate */
670}
671
672enum page_references {
673	PAGEREF_RECLAIM,
674	PAGEREF_RECLAIM_CLEAN,
675	PAGEREF_KEEP,
676	PAGEREF_ACTIVATE,
677};
678
679static enum page_references page_check_references(struct page *page,
680						  struct scan_control *sc)
681{
682	int referenced_ptes, referenced_page;
683	unsigned long vm_flags;
684
685	referenced_ptes = page_referenced(page, 1, sc->mem_cgroup, &vm_flags);
686	referenced_page = TestClearPageReferenced(page);
687
688	/* Lumpy reclaim - ignore references */
689	if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
690		return PAGEREF_RECLAIM;
691
692	/*
693	 * Mlock lost the isolation race with us.  Let try_to_unmap()
694	 * move the page to the unevictable list.
695	 */
696	if (vm_flags & VM_LOCKED)
697		return PAGEREF_RECLAIM;
698
699	if (referenced_ptes) {
700		if (PageAnon(page))
701			return PAGEREF_ACTIVATE;
702		/*
703		 * All mapped pages start out with page table
704		 * references from the instantiating fault, so we need
705		 * to look twice if a mapped file page is used more
706		 * than once.
707		 *
708		 * Mark it and spare it for another trip around the
709		 * inactive list.  Another page table reference will
710		 * lead to its activation.
711		 *
712		 * Note: the mark is set for activated pages as well
713		 * so that recently deactivated but used pages are
714		 * quickly recovered.
715		 */
716		SetPageReferenced(page);
717
718		if (referenced_page || referenced_ptes > 1)
719			return PAGEREF_ACTIVATE;
720
721		/*
722		 * Activate file-backed executable pages after first usage.
723		 */
724		if (vm_flags & VM_EXEC)
725			return PAGEREF_ACTIVATE;
726
727		return PAGEREF_KEEP;
728	}
729
730	/* Reclaim if clean, defer dirty pages to writeback */
731	if (referenced_page && !PageSwapBacked(page))
732		return PAGEREF_RECLAIM_CLEAN;
733
734	return PAGEREF_RECLAIM;
735}
736
737/*
738 * shrink_page_list() returns the number of reclaimed pages
739 */
740static unsigned long shrink_page_list(struct list_head *page_list,
741				      struct zone *zone,
742				      struct scan_control *sc,
743				      int priority,
744				      unsigned long *ret_nr_dirty,
745				      unsigned long *ret_nr_writeback)
746{
747	LIST_HEAD(ret_pages);
748	LIST_HEAD(free_pages);
749	int pgactivate = 0;
750	unsigned long nr_dirty = 0;
751	unsigned long nr_congested = 0;
752	unsigned long nr_reclaimed = 0;
753	unsigned long nr_writeback = 0;
754
755	cond_resched();
756
757	while (!list_empty(page_list)) {
758		enum page_references references;
759		struct address_space *mapping;
760		struct page *page;
761		int may_enter_fs;
762
763		cond_resched();
764
765		page = lru_to_page(page_list);
766		list_del(&page->lru);
767
768		if (!trylock_page(page))
769			goto keep;
770
771		VM_BUG_ON(PageActive(page));
772		VM_BUG_ON(page_zone(page) != zone);
773
774		sc->nr_scanned++;
775
776		if (unlikely(!page_evictable(page, NULL)))
777			goto cull_mlocked;
778
779		if (!sc->may_unmap && page_mapped(page))
780			goto keep_locked;
781
782		/* Double the slab pressure for mapped and swapcache pages */
783		if (page_mapped(page) || PageSwapCache(page))
784			sc->nr_scanned++;
785
786		may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
787			(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
788
789		if (PageWriteback(page)) {
790			nr_writeback++;
791			/*
792			 * Synchronous reclaim cannot queue pages for
793			 * writeback due to the possibility of stack overflow
794			 * but if it encounters a page under writeback, wait
795			 * for the IO to complete.
796			 */
797			if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) &&
798			    may_enter_fs)
799				wait_on_page_writeback(page);
800			else {
801				unlock_page(page);
802				goto keep_lumpy;
803			}
804		}
805
806		references = page_check_references(page, sc);
807		switch (references) {
808		case PAGEREF_ACTIVATE:
809			goto activate_locked;
810		case PAGEREF_KEEP:
811			goto keep_locked;
812		case PAGEREF_RECLAIM:
813		case PAGEREF_RECLAIM_CLEAN:
814			; /* try to reclaim the page below */
815		}
816
817		/*
818		 * Anonymous process memory has backing store?
819		 * Try to allocate it some swap space here.
820		 */
821		if (PageAnon(page) && !PageSwapCache(page)) {
822			if (!(sc->gfp_mask & __GFP_IO))
823				goto keep_locked;
824			if (!add_to_swap(page))
825				goto activate_locked;
826			may_enter_fs = 1;
827		}
828
829		mapping = page_mapping(page);
830
831		/*
832		 * The page is mapped into the page tables of one or more
833		 * processes. Try to unmap it here.
834		 */
835		if (page_mapped(page) && mapping) {
836			switch (try_to_unmap(page, TTU_UNMAP)) {
837			case SWAP_FAIL:
838				goto activate_locked;
839			case SWAP_AGAIN:
840				goto keep_locked;
841			case SWAP_MLOCK:
842				goto cull_mlocked;
843			case SWAP_SUCCESS:
844				; /* try to free the page below */
845			}
846		}
847
848		if (PageDirty(page)) {
849			nr_dirty++;
850
851			/*
852			 * Only kswapd can writeback filesystem pages to
853			 * avoid risk of stack overflow but do not writeback
854			 * unless under significant pressure.
855			 */
856			if (page_is_file_cache(page) &&
857					(!current_is_kswapd() || priority >= DEF_PRIORITY - 2)) {
858				/*
859				 * Immediately reclaim when written back.
860				 * Similar in principal to deactivate_page()
861				 * except we already have the page isolated
862				 * and know it's dirty
863				 */
864				inc_zone_page_state(page, NR_VMSCAN_IMMEDIATE);
865				SetPageReclaim(page);
866
867				goto keep_locked;
868			}
869
870			if (references == PAGEREF_RECLAIM_CLEAN)
871				goto keep_locked;
872			if (!may_enter_fs)
873				goto keep_locked;
874			if (!sc->may_writepage)
875				goto keep_locked;
876
877			/* Page is dirty, try to write it out here */
878			switch (pageout(page, mapping, sc)) {
879			case PAGE_KEEP:
880				nr_congested++;
881				goto keep_locked;
882			case PAGE_ACTIVATE:
883				goto activate_locked;
884			case PAGE_SUCCESS:
885				if (PageWriteback(page))
886					goto keep_lumpy;
887				if (PageDirty(page))
888					goto keep;
889
890				/*
891				 * A synchronous write - probably a ramdisk.  Go
892				 * ahead and try to reclaim the page.
893				 */
894				if (!trylock_page(page))
895					goto keep;
896				if (PageDirty(page) || PageWriteback(page))
897					goto keep_locked;
898				mapping = page_mapping(page);
899			case PAGE_CLEAN:
900				; /* try to free the page below */
901			}
902		}
903
904		/*
905		 * If the page has buffers, try to free the buffer mappings
906		 * associated with this page. If we succeed we try to free
907		 * the page as well.
908		 *
909		 * We do this even if the page is PageDirty().
910		 * try_to_release_page() does not perform I/O, but it is
911		 * possible for a page to have PageDirty set, but it is actually
912		 * clean (all its buffers are clean).  This happens if the
913		 * buffers were written out directly, with submit_bh(). ext3
914		 * will do this, as well as the blockdev mapping.
915		 * try_to_release_page() will discover that cleanness and will
916		 * drop the buffers and mark the page clean - it can be freed.
917		 *
918		 * Rarely, pages can have buffers and no ->mapping.  These are
919		 * the pages which were not successfully invalidated in
920		 * truncate_complete_page().  We try to drop those buffers here
921		 * and if that worked, and the page is no longer mapped into
922		 * process address space (page_count == 1) it can be freed.
923		 * Otherwise, leave the page on the LRU so it is swappable.
924		 */
925		if (page_has_private(page)) {
926			if (!try_to_release_page(page, sc->gfp_mask))
927				goto activate_locked;
928			if (!mapping && page_count(page) == 1) {
929				unlock_page(page);
930				if (put_page_testzero(page))
931					goto free_it;
932				else {
933					/*
934					 * rare race with speculative reference.
935					 * the speculative reference will free
936					 * this page shortly, so we may
937					 * increment nr_reclaimed here (and
938					 * leave it off the LRU).
939					 */
940					nr_reclaimed++;
941					continue;
942				}
943			}
944		}
945
946		if (!mapping || !__remove_mapping(mapping, page))
947			goto keep_locked;
948
949		/*
950		 * At this point, we have no other references and there is
951		 * no way to pick any more up (removed from LRU, removed
952		 * from pagecache). Can use non-atomic bitops now (and
953		 * we obviously don't have to worry about waking up a process
954		 * waiting on the page lock, because there are no references.
955		 */
956		__clear_page_locked(page);
957free_it:
958		nr_reclaimed++;
959
960		/*
961		 * Is there need to periodically free_page_list? It would
962		 * appear not as the counts should be low
963		 */
964		list_add(&page->lru, &free_pages);
965		continue;
966
967cull_mlocked:
968		if (PageSwapCache(page))
969			try_to_free_swap(page);
970		unlock_page(page);
971		putback_lru_page(page);
972		reset_reclaim_mode(sc);
973		continue;
974
975activate_locked:
976		/* Not a candidate for swapping, so reclaim swap space. */
977		if (PageSwapCache(page) && vm_swap_full())
978			try_to_free_swap(page);
979		VM_BUG_ON(PageActive(page));
980		SetPageActive(page);
981		pgactivate++;
982keep_locked:
983		unlock_page(page);
984keep:
985		reset_reclaim_mode(sc);
986keep_lumpy:
987		list_add(&page->lru, &ret_pages);
988		VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
989	}
990
991	/*
992	 * Tag a zone as congested if all the dirty pages encountered were
993	 * backed by a congested BDI. In this case, reclaimers should just
994	 * back off and wait for congestion to clear because further reclaim
995	 * will encounter the same problem
996	 */
997	if (nr_dirty && nr_dirty == nr_congested && scanning_global_lru(sc))
998		zone_set_flag(zone, ZONE_CONGESTED);
999
1000	free_hot_cold_page_list(&free_pages, 1);
1001
1002	list_splice(&ret_pages, page_list);
1003	count_vm_events(PGACTIVATE, pgactivate);
1004	*ret_nr_dirty += nr_dirty;
1005	*ret_nr_writeback += nr_writeback;
1006	return nr_reclaimed;
1007}
1008
1009/*
1010 * Attempt to remove the specified page from its LRU.  Only take this page
1011 * if it is of the appropriate PageActive status.  Pages which are being
1012 * freed elsewhere are also ignored.
1013 *
1014 * page:	page to consider
1015 * mode:	one of the LRU isolation modes defined above
1016 *
1017 * returns 0 on success, -ve errno on failure.
1018 */
1019int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file)
1020{
1021	bool all_lru_mode;
1022	int ret = -EINVAL;
1023
1024	/* Only take pages on the LRU. */
1025	if (!PageLRU(page))
1026		return ret;
1027
1028	all_lru_mode = (mode & (ISOLATE_ACTIVE|ISOLATE_INACTIVE)) ==
1029		(ISOLATE_ACTIVE|ISOLATE_INACTIVE);
1030
1031	/*
1032	 * When checking the active state, we need to be sure we are
1033	 * dealing with comparible boolean values.  Take the logical not
1034	 * of each.
1035	 */
1036	if (!all_lru_mode && !PageActive(page) != !(mode & ISOLATE_ACTIVE))
1037		return ret;
1038
1039	if (!all_lru_mode && !!page_is_file_cache(page) != file)
1040		return ret;
1041
1042	/*
1043	 * When this function is being called for lumpy reclaim, we
1044	 * initially look into all LRU pages, active, inactive and
1045	 * unevictable; only give shrink_page_list evictable pages.
1046	 */
1047	if (PageUnevictable(page))
1048		return ret;
1049
1050	ret = -EBUSY;
1051
1052	if ((mode & ISOLATE_CLEAN) && (PageDirty(page) || PageWriteback(page)))
1053		return ret;
1054
1055	if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
1056		return ret;
1057
1058	if (likely(get_page_unless_zero(page))) {
1059		/*
1060		 * Be careful not to clear PageLRU until after we're
1061		 * sure the page is not being freed elsewhere -- the
1062		 * page release code relies on it.
1063		 */
1064		ClearPageLRU(page);
1065		ret = 0;
1066	}
1067
1068	return ret;
1069}
1070
1071/*
1072 * zone->lru_lock is heavily contended.  Some of the functions that
1073 * shrink the lists perform better by taking out a batch of pages
1074 * and working on them outside the LRU lock.
1075 *
1076 * For pagecache intensive workloads, this function is the hottest
1077 * spot in the kernel (apart from copy_*_user functions).
1078 *
1079 * Appropriate locks must be held before calling this function.
1080 *
1081 * @nr_to_scan:	The number of pages to look through on the list.
1082 * @src:	The LRU list to pull pages off.
1083 * @dst:	The temp list to put pages on to.
1084 * @scanned:	The number of pages that were scanned.
1085 * @order:	The caller's attempted allocation order
1086 * @mode:	One of the LRU isolation modes
1087 * @file:	True [1] if isolating file [!anon] pages
1088 *
1089 * returns how many pages were moved onto *@dst.
1090 */
1091static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1092		struct list_head *src, struct list_head *dst,
1093		unsigned long *scanned, int order, isolate_mode_t mode,
1094		int file)
1095{
1096	unsigned long nr_taken = 0;
1097	unsigned long nr_lumpy_taken = 0;
1098	unsigned long nr_lumpy_dirty = 0;
1099	unsigned long nr_lumpy_failed = 0;
1100	unsigned long scan;
1101
1102	for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
1103		struct page *page;
1104		unsigned long pfn;
1105		unsigned long end_pfn;
1106		unsigned long page_pfn;
1107		int zone_id;
1108
1109		page = lru_to_page(src);
1110		prefetchw_prev_lru_page(page, src, flags);
1111
1112		VM_BUG_ON(!PageLRU(page));
1113
1114		switch (__isolate_lru_page(page, mode, file)) {
1115		case 0:
1116			list_move(&page->lru, dst);
1117			mem_cgroup_del_lru(page);
1118			nr_taken += hpage_nr_pages(page);
1119			break;
1120
1121		case -EBUSY:
1122			/* else it is being freed elsewhere */
1123			list_move(&page->lru, src);
1124			mem_cgroup_rotate_lru_list(page, page_lru(page));
1125			continue;
1126
1127		default:
1128			BUG();
1129		}
1130
1131		if (!order)
1132			continue;
1133
1134		/*
1135		 * Attempt to take all pages in the order aligned region
1136		 * surrounding the tag page.  Only take those pages of
1137		 * the same active state as that tag page.  We may safely
1138		 * round the target page pfn down to the requested order
1139		 * as the mem_map is guaranteed valid out to MAX_ORDER,
1140		 * where that page is in a different zone we will detect
1141		 * it from its zone id and abort this block scan.
1142		 */
1143		zone_id = page_zone_id(page);
1144		page_pfn = page_to_pfn(page);
1145		pfn = page_pfn & ~((1 << order) - 1);
1146		end_pfn = pfn + (1 << order);
1147		for (; pfn < end_pfn; pfn++) {
1148			struct page *cursor_page;
1149
1150			/* The target page is in the block, ignore it. */
1151			if (unlikely(pfn == page_pfn))
1152				continue;
1153
1154			/* Avoid holes within the zone. */
1155			if (unlikely(!pfn_valid_within(pfn)))
1156				break;
1157
1158			cursor_page = pfn_to_page(pfn);
1159
1160			/* Check that we have not crossed a zone boundary. */
1161			if (unlikely(page_zone_id(cursor_page) != zone_id))
1162				break;
1163
1164			/*
1165			 * If we don't have enough swap space, reclaiming of
1166			 * anon page which don't already have a swap slot is
1167			 * pointless.
1168			 */
1169			if (nr_swap_pages <= 0 && PageAnon(cursor_page) &&
1170			    !PageSwapCache(cursor_page))
1171				break;
1172
1173			if (__isolate_lru_page(cursor_page, mode, file) == 0) {
1174				list_move(&cursor_page->lru, dst);
1175				mem_cgroup_del_lru(cursor_page);
1176				nr_taken += hpage_nr_pages(page);
1177				nr_lumpy_taken++;
1178				if (PageDirty(cursor_page))
1179					nr_lumpy_dirty++;
1180				scan++;
1181			} else {
1182				/*
1183				 * Check if the page is freed already.
1184				 *
1185				 * We can't use page_count() as that
1186				 * requires compound_head and we don't
1187				 * have a pin on the page here. If a
1188				 * page is tail, we may or may not
1189				 * have isolated the head, so assume
1190				 * it's not free, it'd be tricky to
1191				 * track the head status without a
1192				 * page pin.
1193				 */
1194				if (!PageTail(cursor_page) &&
1195				    !atomic_read(&cursor_page->_count))
1196					continue;
1197				break;
1198			}
1199		}
1200
1201		/* If we break out of the loop above, lumpy reclaim failed */
1202		if (pfn < end_pfn)
1203			nr_lumpy_failed++;
1204	}
1205
1206	*scanned = scan;
1207
1208	trace_mm_vmscan_lru_isolate(order,
1209			nr_to_scan, scan,
1210			nr_taken,
1211			nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed,
1212			mode);
1213	return nr_taken;
1214}
1215
1216static unsigned long isolate_pages_global(unsigned long nr,
1217					struct list_head *dst,
1218					unsigned long *scanned, int order,
1219					isolate_mode_t mode,
1220					struct zone *z,	int active, int file)
1221{
1222	int lru = LRU_BASE;
1223	if (active)
1224		lru += LRU_ACTIVE;
1225	if (file)
1226		lru += LRU_FILE;
1227	return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order,
1228								mode, file);
1229}
1230
1231/*
1232 * clear_active_flags() is a helper for shrink_active_list(), clearing
1233 * any active bits from the pages in the list.
1234 */
1235static unsigned long clear_active_flags(struct list_head *page_list,
1236					unsigned int *count)
1237{
1238	int nr_active = 0;
1239	int lru;
1240	struct page *page;
1241
1242	list_for_each_entry(page, page_list, lru) {
1243		int numpages = hpage_nr_pages(page);
1244		lru = page_lru_base_type(page);
1245		if (PageActive(page)) {
1246			lru += LRU_ACTIVE;
1247			ClearPageActive(page);
1248			nr_active += numpages;
1249		}
1250		if (count)
1251			count[lru] += numpages;
1252	}
1253
1254	return nr_active;
1255}
1256
1257/**
1258 * isolate_lru_page - tries to isolate a page from its LRU list
1259 * @page: page to isolate from its LRU list
1260 *
1261 * Isolates a @page from an LRU list, clears PageLRU and adjusts the
1262 * vmstat statistic corresponding to whatever LRU list the page was on.
1263 *
1264 * Returns 0 if the page was removed from an LRU list.
1265 * Returns -EBUSY if the page was not on an LRU list.
1266 *
1267 * The returned page will have PageLRU() cleared.  If it was found on
1268 * the active list, it will have PageActive set.  If it was found on
1269 * the unevictable list, it will have the PageUnevictable bit set. That flag
1270 * may need to be cleared by the caller before letting the page go.
1271 *
1272 * The vmstat statistic corresponding to the list on which the page was
1273 * found will be decremented.
1274 *
1275 * Restrictions:
1276 * (1) Must be called with an elevated refcount on the page. This is a
1277 *     fundamentnal difference from isolate_lru_pages (which is called
1278 *     without a stable reference).
1279 * (2) the lru_lock must not be held.
1280 * (3) interrupts must be enabled.
1281 */
1282int isolate_lru_page(struct page *page)
1283{
1284	int ret = -EBUSY;
1285
1286	VM_BUG_ON(!page_count(page));
1287
1288	if (PageLRU(page)) {
1289		struct zone *zone = page_zone(page);
1290
1291		spin_lock_irq(&zone->lru_lock);
1292		if (PageLRU(page)) {
1293			int lru = page_lru(page);
1294			ret = 0;
1295			get_page(page);
1296			ClearPageLRU(page);
1297
1298			del_page_from_lru_list(zone, page, lru);
1299		}
1300		spin_unlock_irq(&zone->lru_lock);
1301	}
1302	return ret;
1303}
1304
1305/*
1306 * Are there way too many processes in the direct reclaim path already?
1307 */
1308static int too_many_isolated(struct zone *zone, int file,
1309		struct scan_control *sc)
1310{
1311	unsigned long inactive, isolated;
1312
1313	if (current_is_kswapd())
1314		return 0;
1315
1316	if (!scanning_global_lru(sc))
1317		return 0;
1318
1319	if (file) {
1320		inactive = zone_page_state(zone, NR_INACTIVE_FILE);
1321		isolated = zone_page_state(zone, NR_ISOLATED_FILE);
1322	} else {
1323		inactive = zone_page_state(zone, NR_INACTIVE_ANON);
1324		isolated = zone_page_state(zone, NR_ISOLATED_ANON);
1325	}
1326
1327	return isolated > inactive;
1328}
1329
1330/*
1331 * TODO: Try merging with migrations version of putback_lru_pages
1332 */
1333static noinline_for_stack void
1334putback_lru_pages(struct zone *zone, struct scan_control *sc,
1335				unsigned long nr_anon, unsigned long nr_file,
1336				struct list_head *page_list)
1337{
1338	struct page *page;
1339	struct pagevec pvec;
1340	struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1341
1342	pagevec_init(&pvec, 1);
1343
1344	/*
1345	 * Put back any unfreeable pages.
1346	 */
1347	spin_lock(&zone->lru_lock);
1348	while (!list_empty(page_list)) {
1349		int lru;
1350		page = lru_to_page(page_list);
1351		VM_BUG_ON(PageLRU(page));
1352		list_del(&page->lru);
1353		if (unlikely(!page_evictable(page, NULL))) {
1354			spin_unlock_irq(&zone->lru_lock);
1355			putback_lru_page(page);
1356			spin_lock_irq(&zone->lru_lock);
1357			continue;
1358		}
1359		SetPageLRU(page);
1360		lru = page_lru(page);
1361		add_page_to_lru_list(zone, page, lru);
1362		if (is_active_lru(lru)) {
1363			int file = is_file_lru(lru);
1364			int numpages = hpage_nr_pages(page);
1365			reclaim_stat->recent_rotated[file] += numpages;
1366		}
1367		if (!pagevec_add(&pvec, page)) {
1368			spin_unlock_irq(&zone->lru_lock);
1369			__pagevec_release(&pvec);
1370			spin_lock_irq(&zone->lru_lock);
1371		}
1372	}
1373	__mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon);
1374	__mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file);
1375
1376	spin_unlock_irq(&zone->lru_lock);
1377	pagevec_release(&pvec);
1378}
1379
1380static noinline_for_stack void update_isolated_counts(struct zone *zone,
1381					struct scan_control *sc,
1382					unsigned long *nr_anon,
1383					unsigned long *nr_file,
1384					struct list_head *isolated_list)
1385{
1386	unsigned long nr_active;
1387	unsigned int count[NR_LRU_LISTS] = { 0, };
1388	struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1389
1390	nr_active = clear_active_flags(isolated_list, count);
1391	__count_vm_events(PGDEACTIVATE, nr_active);
1392
1393	__mod_zone_page_state(zone, NR_ACTIVE_FILE,
1394			      -count[LRU_ACTIVE_FILE]);
1395	__mod_zone_page_state(zone, NR_INACTIVE_FILE,
1396			      -count[LRU_INACTIVE_FILE]);
1397	__mod_zone_page_state(zone, NR_ACTIVE_ANON,
1398			      -count[LRU_ACTIVE_ANON]);
1399	__mod_zone_page_state(zone, NR_INACTIVE_ANON,
1400			      -count[LRU_INACTIVE_ANON]);
1401
1402	*nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
1403	*nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
1404	__mod_zone_page_state(zone, NR_ISOLATED_ANON, *nr_anon);
1405	__mod_zone_page_state(zone, NR_ISOLATED_FILE, *nr_file);
1406
1407	reclaim_stat->recent_scanned[0] += *nr_anon;
1408	reclaim_stat->recent_scanned[1] += *nr_file;
1409}
1410
1411/*
1412 * Returns true if a direct reclaim should wait on pages under writeback.
1413 *
1414 * If we are direct reclaiming for contiguous pages and we do not reclaim
1415 * everything in the list, try again and wait for writeback IO to complete.
1416 * This will stall high-order allocations noticeably. Only do that when really
1417 * need to free the pages under high memory pressure.
1418 */
1419static inline bool should_reclaim_stall(unsigned long nr_taken,
1420					unsigned long nr_freed,
1421					int priority,
1422					struct scan_control *sc)
1423{
1424	int lumpy_stall_priority;
1425
1426	/* kswapd should not stall on sync IO */
1427	if (current_is_kswapd())
1428		return false;
1429
1430	/* Only stall on lumpy reclaim */
1431	if (sc->reclaim_mode & RECLAIM_MODE_SINGLE)
1432		return false;
1433
1434	/* If we have reclaimed everything on the isolated list, no stall */
1435	if (nr_freed == nr_taken)
1436		return false;
1437
1438	/*
1439	 * For high-order allocations, there are two stall thresholds.
1440	 * High-cost allocations stall immediately where as lower
1441	 * order allocations such as stacks require the scanning
1442	 * priority to be much higher before stalling.
1443	 */
1444	if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
1445		lumpy_stall_priority = DEF_PRIORITY;
1446	else
1447		lumpy_stall_priority = DEF_PRIORITY / 3;
1448
1449	return priority <= lumpy_stall_priority;
1450}
1451
1452/*
1453 * shrink_inactive_list() is a helper for shrink_zone().  It returns the number
1454 * of reclaimed pages
1455 */
1456static noinline_for_stack unsigned long
1457shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1458			struct scan_control *sc, int priority, int file)
1459{
1460	LIST_HEAD(page_list);
1461	unsigned long nr_scanned;
1462	unsigned long nr_reclaimed = 0;
1463	unsigned long nr_taken;
1464	unsigned long nr_anon;
1465	unsigned long nr_file;
1466	unsigned long nr_dirty = 0;
1467	unsigned long nr_writeback = 0;
1468	isolate_mode_t reclaim_mode = ISOLATE_INACTIVE;
1469
1470	while (unlikely(too_many_isolated(zone, file, sc))) {
1471		congestion_wait(BLK_RW_ASYNC, HZ/10);
1472
1473		/* We are about to die and free our memory. Return now. */
1474		if (fatal_signal_pending(current))
1475			return SWAP_CLUSTER_MAX;
1476	}
1477
1478	set_reclaim_mode(priority, sc, false);
1479	if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
1480		reclaim_mode |= ISOLATE_ACTIVE;
1481
1482	lru_add_drain();
1483
1484	if (!sc->may_unmap)
1485		reclaim_mode |= ISOLATE_UNMAPPED;
1486	if (!sc->may_writepage)
1487		reclaim_mode |= ISOLATE_CLEAN;
1488
1489	spin_lock_irq(&zone->lru_lock);
1490
1491	if (scanning_global_lru(sc)) {
1492		nr_taken = isolate_pages_global(nr_to_scan, &page_list,
1493			&nr_scanned, sc->order, reclaim_mode, zone, 0, file);
1494		zone->pages_scanned += nr_scanned;
1495		if (current_is_kswapd())
1496			__count_zone_vm_events(PGSCAN_KSWAPD, zone,
1497					       nr_scanned);
1498		else
1499			__count_zone_vm_events(PGSCAN_DIRECT, zone,
1500					       nr_scanned);
1501	} else {
1502		nr_taken = mem_cgroup_isolate_pages(nr_to_scan, &page_list,
1503			&nr_scanned, sc->order, reclaim_mode, zone,
1504			sc->mem_cgroup, 0, file);
1505		/*
1506		 * mem_cgroup_isolate_pages() keeps track of
1507		 * scanned pages on its own.
1508		 */
1509	}
1510
1511	if (nr_taken == 0) {
1512		spin_unlock_irq(&zone->lru_lock);
1513		return 0;
1514	}
1515
1516	update_isolated_counts(zone, sc, &nr_anon, &nr_file, &page_list);
1517
1518	spin_unlock_irq(&zone->lru_lock);
1519
1520	nr_reclaimed = shrink_page_list(&page_list, zone, sc, priority,
1521						&nr_dirty, &nr_writeback);
1522
1523	/* Check if we should syncronously wait for writeback */
1524	if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
1525		set_reclaim_mode(priority, sc, true);
1526		nr_reclaimed += shrink_page_list(&page_list, zone, sc,
1527					priority, &nr_dirty, &nr_writeback);
1528	}
1529
1530	local_irq_disable();
1531	if (current_is_kswapd())
1532		__count_vm_events(KSWAPD_STEAL, nr_reclaimed);
1533	__count_zone_vm_events(PGSTEAL, zone, nr_reclaimed);
1534
1535	putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list);
1536
1537	/*
1538	 * If reclaim is isolating dirty pages under writeback, it implies
1539	 * that the long-lived page allocation rate is exceeding the page
1540	 * laundering rate. Either the global limits are not being effective
1541	 * at throttling processes due to the page distribution throughout
1542	 * zones or there is heavy usage of a slow backing device. The
1543	 * only option is to throttle from reclaim context which is not ideal
1544	 * as there is no guarantee the dirtying process is throttled in the
1545	 * same way balance_dirty_pages() manages.
1546	 *
1547	 * This scales the number of dirty pages that must be under writeback
1548	 * before throttling depending on priority. It is a simple backoff
1549	 * function that has the most effect in the range DEF_PRIORITY to
1550	 * DEF_PRIORITY-2 which is the priority reclaim is considered to be
1551	 * in trouble and reclaim is considered to be in trouble.
1552	 *
1553	 * DEF_PRIORITY   100% isolated pages must be PageWriteback to throttle
1554	 * DEF_PRIORITY-1  50% must be PageWriteback
1555	 * DEF_PRIORITY-2  25% must be PageWriteback, kswapd in trouble
1556	 * ...
1557	 * DEF_PRIORITY-6 For SWAP_CLUSTER_MAX isolated pages, throttle if any
1558	 *                     isolated page is PageWriteback
1559	 */
1560	if (nr_writeback && nr_writeback >= (nr_taken >> (DEF_PRIORITY-priority)))
1561		wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
1562
1563	trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
1564		zone_idx(zone),
1565		nr_scanned, nr_reclaimed,
1566		priority,
1567		trace_shrink_flags(file, sc->reclaim_mode));
1568	return nr_reclaimed;
1569}
1570
1571/*
1572 * This moves pages from the active list to the inactive list.
1573 *
1574 * We move them the other way if the page is referenced by one or more
1575 * processes, from rmap.
1576 *
1577 * If the pages are mostly unmapped, the processing is fast and it is
1578 * appropriate to hold zone->lru_lock across the whole operation.  But if
1579 * the pages are mapped, the processing is slow (page_referenced()) so we
1580 * should drop zone->lru_lock around each page.  It's impossible to balance
1581 * this, so instead we remove the pages from the LRU while processing them.
1582 * It is safe to rely on PG_active against the non-LRU pages in here because
1583 * nobody will play with that bit on a non-LRU page.
1584 *
1585 * The downside is that we have to touch page->_count against each page.
1586 * But we had to alter page->flags anyway.
1587 */
1588
1589static void move_active_pages_to_lru(struct zone *zone,
1590				     struct list_head *list,
1591				     enum lru_list lru)
1592{
1593	unsigned long pgmoved = 0;
1594	struct pagevec pvec;
1595	struct page *page;
1596
1597	pagevec_init(&pvec, 1);
1598
1599	while (!list_empty(list)) {
1600		page = lru_to_page(list);
1601
1602		VM_BUG_ON(PageLRU(page));
1603		SetPageLRU(page);
1604
1605		list_move(&page->lru, &zone->lru[lru].list);
1606		mem_cgroup_add_lru_list(page, lru);
1607		pgmoved += hpage_nr_pages(page);
1608
1609		if (!pagevec_add(&pvec, page) || list_empty(list)) {
1610			spin_unlock_irq(&zone->lru_lock);
1611			if (buffer_heads_over_limit)
1612				pagevec_strip(&pvec);
1613			__pagevec_release(&pvec);
1614			spin_lock_irq(&zone->lru_lock);
1615		}
1616	}
1617	__mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
1618	if (!is_active_lru(lru))
1619		__count_vm_events(PGDEACTIVATE, pgmoved);
1620}
1621
1622static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1623			struct scan_control *sc, int priority, int file)
1624{
1625	unsigned long nr_taken;
1626	unsigned long pgscanned;
1627	unsigned long vm_flags;
1628	LIST_HEAD(l_hold);	/* The pages which were snipped off */
1629	LIST_HEAD(l_active);
1630	LIST_HEAD(l_inactive);
1631	struct page *page;
1632	struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1633	unsigned long nr_rotated = 0;
1634	isolate_mode_t reclaim_mode = ISOLATE_ACTIVE;
1635
1636	lru_add_drain();
1637
1638	if (!sc->may_unmap)
1639		reclaim_mode |= ISOLATE_UNMAPPED;
1640	if (!sc->may_writepage)
1641		reclaim_mode |= ISOLATE_CLEAN;
1642
1643	spin_lock_irq(&zone->lru_lock);
1644	if (scanning_global_lru(sc)) {
1645		nr_taken = isolate_pages_global(nr_pages, &l_hold,
1646						&pgscanned, sc->order,
1647						reclaim_mode, zone,
1648						1, file);
1649		zone->pages_scanned += pgscanned;
1650	} else {
1651		nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold,
1652						&pgscanned, sc->order,
1653						reclaim_mode, zone,
1654						sc->mem_cgroup, 1, file);
1655		/*
1656		 * mem_cgroup_isolate_pages() keeps track of
1657		 * scanned pages on its own.
1658		 */
1659	}
1660
1661	reclaim_stat->recent_scanned[file] += nr_taken;
1662
1663	__count_zone_vm_events(PGREFILL, zone, pgscanned);
1664	if (file)
1665		__mod_zone_page_state(zone, NR_ACTIVE_FILE, -nr_taken);
1666	else
1667		__mod_zone_page_state(zone, NR_ACTIVE_ANON, -nr_taken);
1668	__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
1669	spin_unlock_irq(&zone->lru_lock);
1670
1671	while (!list_empty(&l_hold)) {
1672		cond_resched();
1673		page = lru_to_page(&l_hold);
1674		list_del(&page->lru);
1675
1676		if (unlikely(!page_evictable(page, NULL))) {
1677			putback_lru_page(page);
1678			continue;
1679		}
1680
1681		if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) {
1682			nr_rotated += hpage_nr_pages(page);
1683			/*
1684			 * Identify referenced, file-backed active pages and
1685			 * give them one more trip around the active list. So
1686			 * that executable code get better chances to stay in
1687			 * memory under moderate memory pressure.  Anon pages
1688			 * are not likely to be evicted by use-once streaming
1689			 * IO, plus JVM can create lots of anon VM_EXEC pages,
1690			 * so we ignore them here.
1691			 */
1692			if ((vm_flags & VM_EXEC) && page_is_file_cache(page)) {
1693				list_add(&page->lru, &l_active);
1694				continue;
1695			}
1696		}
1697
1698		ClearPageActive(page);	/* we are de-activating */
1699		list_add(&page->lru, &l_inactive);
1700	}
1701
1702	/*
1703	 * Move pages back to the lru list.
1704	 */
1705	spin_lock_irq(&zone->lru_lock);
1706	/*
1707	 * Count referenced pages from currently used mappings as rotated,
1708	 * even though only some of them are actually re-activated.  This
1709	 * helps balance scan pressure between file and anonymous pages in
1710	 * get_scan_ratio.
1711	 */
1712	reclaim_stat->recent_rotated[file] += nr_rotated;
1713
1714	move_active_pages_to_lru(zone, &l_active,
1715						LRU_ACTIVE + file * LRU_FILE);
1716	move_active_pages_to_lru(zone, &l_inactive,
1717						LRU_BASE   + file * LRU_FILE);
1718	__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
1719	spin_unlock_irq(&zone->lru_lock);
1720}
1721
1722#ifdef CONFIG_SWAP
1723static int inactive_anon_is_low_global(struct zone *zone)
1724{
1725	unsigned long active, inactive;
1726
1727	active = zone_page_state(zone, NR_ACTIVE_ANON);
1728	inactive = zone_page_state(zone, NR_INACTIVE_ANON);
1729
1730	if (inactive * zone->inactive_ratio < active)
1731		return 1;
1732
1733	return 0;
1734}
1735
1736/**
1737 * inactive_anon_is_low - check if anonymous pages need to be deactivated
1738 * @zone: zone to check
1739 * @sc:   scan control of this context
1740 *
1741 * Returns true if the zone does not have enough inactive anon pages,
1742 * meaning some active anon pages need to be deactivated.
1743 */
1744static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc)
1745{
1746	int low;
1747
1748	/*
1749	 * If we don't have swap space, anonymous page deactivation
1750	 * is pointless.
1751	 */
1752	if (!total_swap_pages)
1753		return 0;
1754
1755	if (scanning_global_lru(sc))
1756		low = inactive_anon_is_low_global(zone);
1757	else
1758		low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup, zone);
1759	return low;
1760}
1761#else
1762static inline int inactive_anon_is_low(struct zone *zone,
1763					struct scan_control *sc)
1764{
1765	return 0;
1766}
1767#endif
1768
1769static int inactive_file_is_low_global(struct zone *zone)
1770{
1771	unsigned long active, inactive;
1772
1773	active = zone_page_state(zone, NR_ACTIVE_FILE);
1774	inactive = zone_page_state(zone, NR_INACTIVE_FILE);
1775
1776	return (active > inactive);
1777}
1778
1779/**
1780 * inactive_file_is_low - check if file pages need to be deactivated
1781 * @zone: zone to check
1782 * @sc:   scan control of this context
1783 *
1784 * When the system is doing streaming IO, memory pressure here
1785 * ensures that active file pages get deactivated, until more
1786 * than half of the file pages are on the inactive list.
1787 *
1788 * Once we get to that situation, protect the system's working
1789 * set from being evicted by disabling active file page aging.
1790 *
1791 * This uses a different ratio than the anonymous pages, because
1792 * the page cache uses a use-once replacement algorithm.
1793 */
1794static int inactive_file_is_low(struct zone *zone, struct scan_control *sc)
1795{
1796	int low;
1797
1798	if (scanning_global_lru(sc))
1799		low = inactive_file_is_low_global(zone);
1800	else
1801		low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup, zone);
1802	return low;
1803}
1804
1805static int inactive_list_is_low(struct zone *zone, struct scan_control *sc,
1806				int file)
1807{
1808	if (file)
1809		return inactive_file_is_low(zone, sc);
1810	else
1811		return inactive_anon_is_low(zone, sc);
1812}
1813
1814static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
1815	struct zone *zone, struct scan_control *sc, int priority)
1816{
1817	int file = is_file_lru(lru);
1818
1819	if (is_active_lru(lru)) {
1820		if (inactive_list_is_low(zone, sc, file))
1821		    shrink_active_list(nr_to_scan, zone, sc, priority, file);
1822		return 0;
1823	}
1824
1825	return shrink_inactive_list(nr_to_scan, zone, sc, priority, file);
1826}
1827
1828static int vmscan_swappiness(struct scan_control *sc)
1829{
1830	if (scanning_global_lru(sc))
1831		return vm_swappiness;
1832	return mem_cgroup_swappiness(sc->mem_cgroup);
1833}
1834
1835/*
1836 * Determine how aggressively the anon and file LRU lists should be
1837 * scanned.  The relative value of each set of LRU lists is determined
1838 * by looking at the fraction of the pages scanned we did rotate back
1839 * onto the active list instead of evict.
1840 *
1841 * nr[0] = anon pages to scan; nr[1] = file pages to scan
1842 */
1843static void get_scan_count(struct zone *zone, struct scan_control *sc,
1844					unsigned long *nr, int priority)
1845{
1846	unsigned long anon, file, free;
1847	unsigned long anon_prio, file_prio;
1848	unsigned long ap, fp;
1849	struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1850	u64 fraction[2], denominator;
1851	enum lru_list l;
1852	int noswap = 0;
1853	bool force_scan = false;
1854
1855	/*
1856	 * If the zone or memcg is small, nr[l] can be 0.  This
1857	 * results in no scanning on this priority and a potential
1858	 * priority drop.  Global direct reclaim can go to the next
1859	 * zone and tends to have no problems. Global kswapd is for
1860	 * zone balancing and it needs to scan a minimum amount. When
1861	 * reclaiming for a memcg, a priority drop can cause high
1862	 * latencies, so it's better to scan a minimum amount there as
1863	 * well.
1864	 */
1865	if (scanning_global_lru(sc) && current_is_kswapd())
1866		force_scan = true;
1867	if (!scanning_global_lru(sc))
1868		force_scan = true;
1869
1870	/* If we have no swap space, do not bother scanning anon pages. */
1871	if (!sc->may_swap || (nr_swap_pages <= 0)) {
1872		noswap = 1;
1873		fraction[0] = 0;
1874		fraction[1] = 1;
1875		denominator = 1;
1876		goto out;
1877	}
1878
1879	anon  = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
1880		zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
1881	file  = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
1882		zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
1883
1884	if (scanning_global_lru(sc)) {
1885		free  = zone_page_state(zone, NR_FREE_PAGES);
1886		/* If we have very few page cache pages,
1887		   force-scan anon pages. */
1888		if (unlikely(file + free <= high_wmark_pages(zone))) {
1889			fraction[0] = 1;
1890			fraction[1] = 0;
1891			denominator = 1;
1892			goto out;
1893		}
1894	}
1895
1896	/*
1897	 * With swappiness at 100, anonymous and file have the same priority.
1898	 * This scanning priority is essentially the inverse of IO cost.
1899	 */
1900	anon_prio = vmscan_swappiness(sc);
1901	file_prio = 200 - vmscan_swappiness(sc);
1902
1903	/*
1904	 * OK, so we have swap space and a fair amount of page cache
1905	 * pages.  We use the recently rotated / recently scanned
1906	 * ratios to determine how valuable each cache is.
1907	 *
1908	 * Because workloads change over time (and to avoid overflow)
1909	 * we keep these statistics as a floating average, which ends
1910	 * up weighing recent references more than old ones.
1911	 *
1912	 * anon in [0], file in [1]
1913	 */
1914	spin_lock_irq(&zone->lru_lock);
1915	if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
1916		reclaim_stat->recent_scanned[0] /= 2;
1917		reclaim_stat->recent_rotated[0] /= 2;
1918	}
1919
1920	if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) {
1921		reclaim_stat->recent_scanned[1] /= 2;
1922		reclaim_stat->recent_rotated[1] /= 2;
1923	}
1924
1925	/*
1926	 * The amount of pressure on anon vs file pages is inversely
1927	 * proportional to the fraction of recently scanned pages on
1928	 * each list that were recently referenced and in active use.
1929	 */
1930	ap = (anon_prio + 1) * (reclaim_stat->recent_scanned[0] + 1);
1931	ap /= reclaim_stat->recent_rotated[0] + 1;
1932
1933	fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1);
1934	fp /= reclaim_stat->recent_rotated[1] + 1;
1935	spin_unlock_irq(&zone->lru_lock);
1936
1937	fraction[0] = ap;
1938	fraction[1] = fp;
1939	denominator = ap + fp + 1;
1940out:
1941	for_each_evictable_lru(l) {
1942		int file = is_file_lru(l);
1943		unsigned long scan;
1944
1945		scan = zone_nr_lru_pages(zone, sc, l);
1946		if (priority || noswap) {
1947			scan >>= priority;
1948			if (!scan && force_scan)
1949				scan = SWAP_CLUSTER_MAX;
1950			scan = div64_u64(scan * fraction[file], denominator);
1951		}
1952		nr[l] = scan;
1953	}
1954}
1955
1956/*
1957 * Reclaim/compaction depends on a number of pages being freed. To avoid
1958 * disruption to the system, a small number of order-0 pages continue to be
1959 * rotated and reclaimed in the normal fashion. However, by the time we get
1960 * back to the allocator and call try_to_compact_zone(), we ensure that
1961 * there are enough free pages for it to be likely successful
1962 */
1963static inline bool should_continue_reclaim(struct zone *zone,
1964					unsigned long nr_reclaimed,
1965					unsigned long nr_scanned,
1966					struct scan_control *sc)
1967{
1968	unsigned long pages_for_compaction;
1969	unsigned long inactive_lru_pages;
1970
1971	/* If not in reclaim/compaction mode, stop */
1972	if (!(sc->reclaim_mode & RECLAIM_MODE_COMPACTION))
1973		return false;
1974
1975	/* Consider stopping depending on scan and reclaim activity */
1976	if (sc->gfp_mask & __GFP_REPEAT) {
1977		/*
1978		 * For __GFP_REPEAT allocations, stop reclaiming if the
1979		 * full LRU list has been scanned and we are still failing
1980		 * to reclaim pages. This full LRU scan is potentially
1981		 * expensive but a __GFP_REPEAT caller really wants to succeed
1982		 */
1983		if (!nr_reclaimed && !nr_scanned)
1984			return false;
1985	} else {
1986		/*
1987		 * For non-__GFP_REPEAT allocations which can presumably
1988		 * fail without consequence, stop if we failed to reclaim
1989		 * any pages from the last SWAP_CLUSTER_MAX number of
1990		 * pages that were scanned. This will return to the
1991		 * caller faster at the risk reclaim/compaction and
1992		 * the resulting allocation attempt fails
1993		 */
1994		if (!nr_reclaimed)
1995			return false;
1996	}
1997
1998	/*
1999	 * If we have not reclaimed enough pages for compaction and the
2000	 * inactive lists are large enough, continue reclaiming
2001	 */
2002	pages_for_compaction = (2UL << sc->order);
2003	inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON) +
2004				zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
2005	if (sc->nr_reclaimed < pages_for_compaction &&
2006			inactive_lru_pages > pages_for_compaction)
2007		return true;
2008
2009	/* If compaction would go ahead or the allocation would succeed, stop */
2010	switch (compaction_suitable(zone, sc->order)) {
2011	case COMPACT_PARTIAL:
2012	case COMPACT_CONTINUE:
2013		return false;
2014	default:
2015		return true;
2016	}
2017}
2018
2019/*
2020 * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
2021 */
2022static void shrink_zone(int priority, struct zone *zone,
2023				struct scan_control *sc)
2024{
2025	unsigned long nr[NR_LRU_LISTS];
2026	unsigned long nr_to_scan;
2027	enum lru_list l;
2028	unsigned long nr_reclaimed, nr_scanned;
2029	unsigned long nr_to_reclaim = sc->nr_to_reclaim;
2030	struct blk_plug plug;
2031
2032restart:
2033	nr_reclaimed = 0;
2034	nr_scanned = sc->nr_scanned;
2035	get_scan_count(zone, sc, nr, priority);
2036
2037	blk_start_plug(&plug);
2038	while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
2039					nr[LRU_INACTIVE_FILE]) {
2040		for_each_evictable_lru(l) {
2041			if (nr[l]) {
2042				nr_to_scan = min_t(unsigned long,
2043						   nr[l], SWAP_CLUSTER_MAX);
2044				nr[l] -= nr_to_scan;
2045
2046				nr_reclaimed += shrink_list(l, nr_to_scan,
2047							    zone, sc, priority);
2048			}
2049		}
2050		/*
2051		 * On large memory systems, scan >> priority can become
2052		 * really large. This is fine for the starting priority;
2053		 * we want to put equal scanning pressure on each zone.
2054		 * However, if the VM has a harder time of freeing pages,
2055		 * with multiple processes reclaiming pages, the total
2056		 * freeing target can get unreasonably large.
2057		 */
2058		if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY)
2059			break;
2060	}
2061	blk_finish_plug(&plug);
2062	sc->nr_reclaimed += nr_reclaimed;
2063
2064	/*
2065	 * Even if we did not try to evict anon pages at all, we want to
2066	 * rebalance the anon lru active/inactive ratio.
2067	 */
2068	if (inactive_anon_is_low(zone, sc))
2069		shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
2070
2071	/* reclaim/compaction might need reclaim to continue */
2072	if (should_continue_reclaim(zone, nr_reclaimed,
2073					sc->nr_scanned - nr_scanned, sc))
2074		goto restart;
2075
2076	throttle_vm_writeout(sc->gfp_mask);
2077}
2078
2079/*
2080 * This is the direct reclaim path, for page-allocating processes.  We only
2081 * try to reclaim pages from zones which will satisfy the caller's allocation
2082 * request.
2083 *
2084 * We reclaim from a zone even if that zone is over high_wmark_pages(zone).
2085 * Because:
2086 * a) The caller may be trying to free *extra* pages to satisfy a higher-order
2087 *    allocation or
2088 * b) The target zone may be at high_wmark_pages(zone) but the lower zones
2089 *    must go *over* high_wmark_pages(zone) to satisfy the `incremental min'
2090 *    zone defense algorithm.
2091 *
2092 * If a zone is deemed to be full of pinned pages then just give it a light
2093 * scan then give up on it.
2094 *
2095 * This function returns true if a zone is being reclaimed for a costly
2096 * high-order allocation and compaction is either ready to begin or deferred.
2097 * This indicates to the caller that it should retry the allocation or fail.
2098 */
2099static bool shrink_zones(int priority, struct zonelist *zonelist,
2100					struct scan_control *sc)
2101{
2102	struct zoneref *z;
2103	struct zone *zone;
2104	unsigned long nr_soft_reclaimed;
2105	unsigned long nr_soft_scanned;
2106	bool should_abort_reclaim = false;
2107
2108	for_each_zone_zonelist_nodemask(zone, z, zonelist,
2109					gfp_zone(sc->gfp_mask), sc->nodemask) {
2110		if (!populated_zone(zone))
2111			continue;
2112		/*
2113		 * Take care memory controller reclaiming has small influence
2114		 * to global LRU.
2115		 */
2116		if (scanning_global_lru(sc)) {
2117			if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
2118				continue;
2119			if (zone->all_unreclaimable && priority != DEF_PRIORITY)
2120				continue;	/* Let kswapd poll it */
2121			if (COMPACTION_BUILD) {
2122				/*
2123				 * If we already have plenty of memory free for
2124				 * compaction in this zone, don't free any more.
2125				 * Even though compaction is invoked for any
2126				 * non-zero order, only frequent costly order
2127				 * reclamation is disruptive enough to become a
2128				 * noticable problem, like transparent huge page
2129				 * allocations.
2130				 */
2131				if (sc->order > PAGE_ALLOC_COSTLY_ORDER &&
2132					(compaction_suitable(zone, sc->order) ||
2133					 compaction_deferred(zone))) {
2134					should_abort_reclaim = true;
2135					continue;
2136				}
2137			}
2138			/*
2139			 * This steals pages from memory cgroups over softlimit
2140			 * and returns the number of reclaimed pages and
2141			 * scanned pages. This works for global memory pressure
2142			 * and balancing, not for a memcg's limit.
2143			 */
2144			nr_soft_scanned = 0;
2145			nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
2146						sc->order, sc->gfp_mask,
2147						&nr_soft_scanned);
2148			sc->nr_reclaimed += nr_soft_reclaimed;
2149			sc->nr_scanned += nr_soft_scanned;
2150			/* need some check for avoid more shrink_zone() */
2151		}
2152
2153		shrink_zone(priority, zone, sc);
2154	}
2155
2156	return should_abort_reclaim;
2157}
2158
2159static bool zone_reclaimable(struct zone *zone)
2160{
2161	return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
2162}
2163
2164/* All zones in zonelist are unreclaimable? */
2165static bool all_unreclaimable(struct zonelist *zonelist,
2166		struct scan_control *sc)
2167{
2168	struct zoneref *z;
2169	struct zone *zone;
2170
2171	for_each_zone_zonelist_nodemask(zone, z, zonelist,
2172			gfp_zone(sc->gfp_mask), sc->nodemask) {
2173		if (!populated_zone(zone))
2174			continue;
2175		if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
2176			continue;
2177		if (!zone->all_unreclaimable)
2178			return false;
2179	}
2180
2181	return true;
2182}
2183
2184/*
2185 * This is the main entry point to direct page reclaim.
2186 *
2187 * If a full scan of the inactive list fails to free enough memory then we
2188 * are "out of memory" and something needs to be killed.
2189 *
2190 * If the caller is !__GFP_FS then the probability of a failure is reasonably
2191 * high - the zone may be full of dirty or under-writeback pages, which this
2192 * caller can't do much about.  We kick the writeback threads and take explicit
2193 * naps in the hope that some of these pages can be written.  But if the
2194 * allocating task holds filesystem locks which prevent writeout this might not
2195 * work, and the allocation attempt will fail.
2196 *
2197 * returns:	0, if no pages reclaimed
2198 * 		else, the number of pages reclaimed
2199 */
2200static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2201					struct scan_control *sc,
2202					struct shrink_control *shrink)
2203{
2204	int priority;
2205	unsigned long total_scanned = 0;
2206	struct reclaim_state *reclaim_state = current->reclaim_state;
2207	struct zoneref *z;
2208	struct zone *zone;
2209	unsigned long writeback_threshold;
2210
2211	get_mems_allowed();
2212	delayacct_freepages_start();
2213
2214	if (scanning_global_lru(sc))
2215		count_vm_event(ALLOCSTALL);
2216
2217	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
2218		sc->nr_scanned = 0;
2219		if (!priority)
2220			disable_swap_token(sc->mem_cgroup);
2221		if (shrink_zones(priority, zonelist, sc))
2222			break;
2223
2224		/*
2225		 * Don't shrink slabs when reclaiming memory from
2226		 * over limit cgroups
2227		 */
2228		if (scanning_global_lru(sc)) {
2229			unsigned long lru_pages = 0;
2230			for_each_zone_zonelist(zone, z, zonelist,
2231					gfp_zone(sc->gfp_mask)) {
2232				if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
2233					continue;
2234
2235				lru_pages += zone_reclaimable_pages(zone);
2236			}
2237
2238			shrink_slab(shrink, sc->nr_scanned, lru_pages);
2239			if (reclaim_state) {
2240				sc->nr_reclaimed += reclaim_state->reclaimed_slab;
2241				reclaim_state->reclaimed_slab = 0;
2242			}
2243		}
2244		total_scanned += sc->nr_scanned;
2245		if (sc->nr_reclaimed >= sc->nr_to_reclaim)
2246			goto out;
2247
2248		/*
2249		 * Try to write back as many pages as we just scanned.  This
2250		 * tends to cause slow streaming writers to write data to the
2251		 * disk smoothly, at the dirtying rate, which is nice.   But
2252		 * that's undesirable in laptop mode, where we *want* lumpy
2253		 * writeout.  So in laptop mode, write out the whole world.
2254		 */
2255		writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2;
2256		if (total_scanned > writeback_threshold) {
2257			wakeup_flusher_threads(laptop_mode ? 0 : total_scanned,
2258						WB_REASON_TRY_TO_FREE_PAGES);
2259			sc->may_writepage = 1;
2260		}
2261
2262		/* Take a nap, wait for some writeback to complete */
2263		if (!sc->hibernation_mode && sc->nr_scanned &&
2264		    priority < DEF_PRIORITY - 2) {
2265			struct zone *preferred_zone;
2266
2267			first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask),
2268						&cpuset_current_mems_allowed,
2269						&preferred_zone);
2270			wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10);
2271		}
2272	}
2273
2274out:
2275	delayacct_freepages_end();
2276	put_mems_allowed();
2277
2278	if (sc->nr_reclaimed)
2279		return sc->nr_reclaimed;
2280
2281	/*
2282	 * As hibernation is going on, kswapd is freezed so that it can't mark
2283	 * the zone into all_unreclaimable. Thus bypassing all_unreclaimable
2284	 * check.
2285	 */
2286	if (oom_killer_disabled)
2287		return 0;
2288
2289	/* top priority shrink_zones still had more to do? don't OOM, then */
2290	if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc))
2291		return 1;
2292
2293	return 0;
2294}
2295
2296unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
2297				gfp_t gfp_mask, nodemask_t *nodemask)
2298{
2299	unsigned long nr_reclaimed;
2300	struct scan_control sc = {
2301		.gfp_mask = gfp_mask,
2302		.may_writepage = !laptop_mode,
2303		.nr_to_reclaim = SWAP_CLUSTER_MAX,
2304		.may_unmap = 1,
2305		.may_swap = 1,
2306		.order = order,
2307		.mem_cgroup = NULL,
2308		.nodemask = nodemask,
2309	};
2310	struct shrink_control shrink = {
2311		.gfp_mask = sc.gfp_mask,
2312	};
2313
2314	trace_mm_vmscan_direct_reclaim_begin(order,
2315				sc.may_writepage,
2316				gfp_mask);
2317
2318	nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
2319
2320	trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
2321
2322	return nr_reclaimed;
2323}
2324
2325#ifdef CONFIG_CGROUP_MEM_RES_CTLR
2326
2327unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
2328						gfp_t gfp_mask, bool noswap,
2329						struct zone *zone,
2330						unsigned long *nr_scanned)
2331{
2332	struct scan_control sc = {
2333		.nr_scanned = 0,
2334		.nr_to_reclaim = SWAP_CLUSTER_MAX,
2335		.may_writepage = !laptop_mode,
2336		.may_unmap = 1,
2337		.may_swap = !noswap,
2338		.order = 0,
2339		.mem_cgroup = mem,
2340	};
2341
2342	sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
2343			(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
2344
2345	trace_mm_vmscan_memcg_softlimit_reclaim_begin(0,
2346						      sc.may_writepage,
2347						      sc.gfp_mask);
2348
2349	/*
2350	 * NOTE: Although we can get the priority field, using it
2351	 * here is not a good idea, since it limits the pages we can scan.
2352	 * if we don't reclaim here, the shrink_zone from balance_pgdat
2353	 * will pick up pages from other mem cgroup's as well. We hack
2354	 * the priority and make it zero.
2355	 */
2356	shrink_zone(0, zone, &sc);
2357
2358	trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
2359
2360	*nr_scanned = sc.nr_scanned;
2361	return sc.nr_reclaimed;
2362}
2363
2364unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
2365					   gfp_t gfp_mask,
2366					   bool noswap)
2367{
2368	struct zonelist *zonelist;
2369	unsigned long nr_reclaimed;
2370	int nid;
2371	struct scan_control sc = {
2372		.may_writepage = !laptop_mode,
2373		.may_unmap = 1,
2374		.may_swap = !noswap,
2375		.nr_to_reclaim = SWAP_CLUSTER_MAX,
2376		.order = 0,
2377		.mem_cgroup = mem_cont,
2378		.nodemask = NULL, /* we don't care the placement */
2379		.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
2380				(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
2381	};
2382	struct shrink_control shrink = {
2383		.gfp_mask = sc.gfp_mask,
2384	};
2385
2386	/*
2387	 * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't
2388	 * take care of from where we get pages. So the node where we start the
2389	 * scan does not need to be the current node.
2390	 */
2391	nid = mem_cgroup_select_victim_node(mem_cont);
2392
2393	zonelist = NODE_DATA(nid)->node_zonelists;
2394
2395	trace_mm_vmscan_memcg_reclaim_begin(0,
2396					    sc.may_writepage,
2397					    sc.gfp_mask);
2398
2399	nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
2400
2401	trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
2402
2403	return nr_reclaimed;
2404}
2405#endif
2406
2407/*
2408 * pgdat_balanced is used when checking if a node is balanced for high-order
2409 * allocations. Only zones that meet watermarks and are in a zone allowed
2410 * by the callers classzone_idx are added to balanced_pages. The total of
2411 * balanced pages must be at least 25% of the zones allowed by classzone_idx
2412 * for the node to be considered balanced. Forcing all zones to be balanced
2413 * for high orders can cause excessive reclaim when there are imbalanced zones.
2414 * The choice of 25% is due to
2415 *   o a 16M DMA zone that is balanced will not balance a zone on any
2416 *     reasonable sized machine
2417 *   o On all other machines, the top zone must be at least a reasonable
2418 *     percentage of the middle zones. For example, on 32-bit x86, highmem
2419 *     would need to be at least 256M for it to be balance a whole node.
2420 *     Similarly, on x86-64 the Normal zone would need to be at least 1G
2421 *     to balance a node on its own. These seemed like reasonable ratios.
2422 */
2423static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
2424						int classzone_idx)
2425{
2426	unsigned long present_pages = 0;
2427	int i;
2428
2429	for (i = 0; i <= classzone_idx; i++)
2430		present_pages += pgdat->node_zones[i].present_pages;
2431
2432	/* A special case here: if zone has no page, we think it's balanced */
2433	return balanced_pages >= (present_pages >> 2);
2434}
2435
2436/* is kswapd sleeping prematurely? */
2437static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
2438					int classzone_idx)
2439{
2440	int i;
2441	unsigned long balanced = 0;
2442	bool all_zones_ok = true;
2443
2444	/* If a direct reclaimer woke kswapd within HZ/10, it's premature */
2445	if (remaining)
2446		return true;
2447
2448	/* Check the watermark levels */
2449	for (i = 0; i <= classzone_idx; i++) {
2450		struct zone *zone = pgdat->node_zones + i;
2451
2452		if (!populated_zone(zone))
2453			continue;
2454
2455		/*
2456		 * balance_pgdat() skips over all_unreclaimable after
2457		 * DEF_PRIORITY. Effectively, it considers them balanced so
2458		 * they must be considered balanced here as well if kswapd
2459		 * is to sleep
2460		 */
2461		if (zone->all_unreclaimable) {
2462			balanced += zone->present_pages;
2463			continue;
2464		}
2465
2466		if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone),
2467							i, 0))
2468			all_zones_ok = false;
2469		else
2470			balanced += zone->present_pages;
2471	}
2472
2473	/*
2474	 * For high-order requests, the balanced zones must contain at least
2475	 * 25% of the nodes pages for kswapd to sleep. For order-0, all zones
2476	 * must be balanced
2477	 */
2478	if (order)
2479		return !pgdat_balanced(pgdat, balanced, classzone_idx);
2480	else
2481		return !all_zones_ok;
2482}
2483
2484/*
2485 * For kswapd, balance_pgdat() will work across all this node's zones until
2486 * they are all at high_wmark_pages(zone).
2487 *
2488 * Returns the final order kswapd was reclaiming at
2489 *
2490 * There is special handling here for zones which are full of pinned pages.
2491 * This can happen if the pages are all mlocked, or if they are all used by
2492 * device drivers (say, ZONE_DMA).  Or if they are all in use by hugetlb.
2493 * What we do is to detect the case where all pages in the zone have been
2494 * scanned twice and there has been zero successful reclaim.  Mark the zone as
2495 * dead and from now on, only perform a short scan.  Basically we're polling
2496 * the zone for when the problem goes away.
2497 *
2498 * kswapd scans the zones in the highmem->normal->dma direction.  It skips
2499 * zones which have free_pages > high_wmark_pages(zone), but once a zone is
2500 * found to have free_pages <= high_wmark_pages(zone), we scan that zone and the
2501 * lower zones regardless of the number of free pages in the lower zones. This
2502 * interoperates with the page allocator fallback scheme to ensure that aging
2503 * of pages is balanced across the zones.
2504 */
2505static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
2506							int *classzone_idx)
2507{
2508	int all_zones_ok;
2509	unsigned long balanced;
2510	int priority;
2511	int i;
2512	int end_zone = 0;	/* Inclusive.  0 = ZONE_DMA */
2513	unsigned long total_scanned;
2514	struct reclaim_state *reclaim_state = current->reclaim_state;
2515	unsigned long nr_soft_reclaimed;
2516	unsigned long nr_soft_scanned;
2517	struct scan_control sc = {
2518		.gfp_mask = GFP_KERNEL,
2519		.may_unmap = 1,
2520		.may_swap = 1,
2521		/*
2522		 * kswapd doesn't want to be bailed out while reclaim. because
2523		 * we want to put equal scanning pressure on each zone.
2524		 */
2525		.nr_to_reclaim = ULONG_MAX,
2526		.order = order,
2527		.mem_cgroup = NULL,
2528	};
2529	struct shrink_control shrink = {
2530		.gfp_mask = sc.gfp_mask,
2531	};
2532loop_again:
2533	total_scanned = 0;
2534	sc.nr_reclaimed = 0;
2535	sc.may_writepage = !laptop_mode;
2536	count_vm_event(PAGEOUTRUN);
2537
2538	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
2539		unsigned long lru_pages = 0;
2540		int has_under_min_watermark_zone = 0;
2541
2542		/* The swap token gets in the way of swapout... */
2543		if (!priority)
2544			disable_swap_token(NULL);
2545
2546		all_zones_ok = 1;
2547		balanced = 0;
2548
2549		/*
2550		 * Scan in the highmem->dma direction for the highest
2551		 * zone which needs scanning
2552		 */
2553		for (i = pgdat->nr_zones - 1; i >= 0; i--) {
2554			struct zone *zone = pgdat->node_zones + i;
2555
2556			if (!populated_zone(zone))
2557				continue;
2558
2559			if (zone->all_unreclaimable && priority != DEF_PRIORITY)
2560				continue;
2561
2562			/*
2563			 * Do some background aging of the anon list, to give
2564			 * pages a chance to be referenced before reclaiming.
2565			 */
2566			if (inactive_anon_is_low(zone, &sc))
2567				shrink_active_list(SWAP_CLUSTER_MAX, zone,
2568							&sc, priority, 0);
2569
2570			if (!zone_watermark_ok_safe(zone, order,
2571					high_wmark_pages(zone), 0, 0)) {
2572				end_zone = i;
2573				break;
2574			} else {
2575				/* If balanced, clear the congested flag */
2576				zone_clear_flag(zone, ZONE_CONGESTED);
2577			}
2578		}
2579		if (i < 0)
2580			goto out;
2581
2582		for (i = 0; i <= end_zone; i++) {
2583			struct zone *zone = pgdat->node_zones + i;
2584
2585			lru_pages += zone_reclaimable_pages(zone);
2586		}
2587
2588		/*
2589		 * Now scan the zone in the dma->highmem direction, stopping
2590		 * at the last zone which needs scanning.
2591		 *
2592		 * We do this because the page allocator works in the opposite
2593		 * direction.  This prevents the page allocator from allocating
2594		 * pages behind kswapd's direction of progress, which would
2595		 * cause too much scanning of the lower zones.
2596		 */
2597		for (i = 0; i <= end_zone; i++) {
2598			struct zone *zone = pgdat->node_zones + i;
2599			int nr_slab;
2600			unsigned long balance_gap;
2601
2602			if (!populated_zone(zone))
2603				continue;
2604
2605			if (zone->all_unreclaimable && priority != DEF_PRIORITY)
2606				continue;
2607
2608			sc.nr_scanned = 0;
2609
2610			nr_soft_scanned = 0;
2611			/*
2612			 * Call soft limit reclaim before calling shrink_zone.
2613			 */
2614			nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
2615							order, sc.gfp_mask,
2616							&nr_soft_scanned);
2617			sc.nr_reclaimed += nr_soft_reclaimed;
2618			total_scanned += nr_soft_scanned;
2619
2620			/*
2621			 * We put equal pressure on every zone, unless
2622			 * one zone has way too many pages free
2623			 * already. The "too many pages" is defined
2624			 * as the high wmark plus a "gap" where the
2625			 * gap is either the low watermark or 1%
2626			 * of the zone, whichever is smaller.
2627			 */
2628			balance_gap = min(low_wmark_pages(zone),
2629				(zone->present_pages +
2630					KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
2631				KSWAPD_ZONE_BALANCE_GAP_RATIO);
2632			if (!zone_watermark_ok_safe(zone, order,
2633					high_wmark_pages(zone) + balance_gap,
2634					end_zone, 0)) {
2635				shrink_zone(priority, zone, &sc);
2636
2637				reclaim_state->reclaimed_slab = 0;
2638				nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages);
2639				sc.nr_reclaimed += reclaim_state->reclaimed_slab;
2640				total_scanned += sc.nr_scanned;
2641
2642				if (nr_slab == 0 && !zone_reclaimable(zone))
2643					zone->all_unreclaimable = 1;
2644			}
2645
2646			/*
2647			 * If we've done a decent amount of scanning and
2648			 * the reclaim ratio is low, start doing writepage
2649			 * even in laptop mode
2650			 */
2651			if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
2652			    total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
2653				sc.may_writepage = 1;
2654
2655			if (zone->all_unreclaimable) {
2656				if (end_zone && end_zone == i)
2657					end_zone--;
2658				continue;
2659			}
2660
2661			if (!zone_watermark_ok_safe(zone, order,
2662					high_wmark_pages(zone), end_zone, 0)) {
2663				all_zones_ok = 0;
2664				/*
2665				 * We are still under min water mark.  This
2666				 * means that we have a GFP_ATOMIC allocation
2667				 * failure risk. Hurry up!
2668				 */
2669				if (!zone_watermark_ok_safe(zone, order,
2670					    min_wmark_pages(zone), end_zone, 0))
2671					has_under_min_watermark_zone = 1;
2672			} else {
2673				/*
2674				 * If a zone reaches its high watermark,
2675				 * consider it to be no longer congested. It's
2676				 * possible there are dirty pages backed by
2677				 * congested BDIs but as pressure is relieved,
2678				 * spectulatively avoid congestion waits
2679				 */
2680				zone_clear_flag(zone, ZONE_CONGESTED);
2681				if (i <= *classzone_idx)
2682					balanced += zone->present_pages;
2683			}
2684
2685		}
2686		if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))
2687			break;		/* kswapd: all done */
2688		/*
2689		 * OK, kswapd is getting into trouble.  Take a nap, then take
2690		 * another pass across the zones.
2691		 */
2692		if (total_scanned && (priority < DEF_PRIORITY - 2)) {
2693			if (has_under_min_watermark_zone)
2694				count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT);
2695			else
2696				congestion_wait(BLK_RW_ASYNC, HZ/10);
2697		}
2698
2699		/*
2700		 * We do this so kswapd doesn't build up large priorities for
2701		 * example when it is freeing in parallel with allocators. It
2702		 * matches the direct reclaim path behaviour in terms of impact
2703		 * on zone->*_priority.
2704		 */
2705		if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX)
2706			break;
2707	}
2708out:
2709
2710	/*
2711	 * order-0: All zones must meet high watermark for a balanced node
2712	 * high-order: Balanced zones must make up at least 25% of the node
2713	 *             for the node to be balanced
2714	 */
2715	if (!(all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))) {
2716		cond_resched();
2717
2718		try_to_freeze();
2719
2720		/*
2721		 * Fragmentation may mean that the system cannot be
2722		 * rebalanced for high-order allocations in all zones.
2723		 * At this point, if nr_reclaimed < SWAP_CLUSTER_MAX,
2724		 * it means the zones have been fully scanned and are still
2725		 * not balanced. For high-order allocations, there is
2726		 * little point trying all over again as kswapd may
2727		 * infinite loop.
2728		 *
2729		 * Instead, recheck all watermarks at order-0 as they
2730		 * are the most important. If watermarks are ok, kswapd will go
2731		 * back to sleep. High-order users can still perform direct
2732		 * reclaim if they wish.
2733		 */
2734		if (sc.nr_reclaimed < SWAP_CLUSTER_MAX)
2735			order = sc.order = 0;
2736
2737		goto loop_again;
2738	}
2739
2740	/*
2741	 * If kswapd was reclaiming at a higher order, it has the option of
2742	 * sleeping without all zones being balanced. Before it does, it must
2743	 * ensure that the watermarks for order-0 on *all* zones are met and
2744	 * that the congestion flags are cleared. The congestion flag must
2745	 * be cleared as kswapd is the only mechanism that clears the flag
2746	 * and it is potentially going to sleep here.
2747	 */
2748	if (order) {
2749		for (i = 0; i <= end_zone; i++) {
2750			struct zone *zone = pgdat->node_zones + i;
2751
2752			if (!populated_zone(zone))
2753				continue;
2754
2755			if (zone->all_unreclaimable && priority != DEF_PRIORITY)
2756				continue;
2757
2758			/* Confirm the zone is balanced for order-0 */
2759			if (!zone_watermark_ok(zone, 0,
2760					high_wmark_pages(zone), 0, 0)) {
2761				order = sc.order = 0;
2762				goto loop_again;
2763			}
2764
2765			/* If balanced, clear the congested flag */
2766			zone_clear_flag(zone, ZONE_CONGESTED);
2767			if (i <= *classzone_idx)
2768				balanced += zone->present_pages;
2769		}
2770	}
2771
2772	/*
2773	 * Return the order we were reclaiming at so sleeping_prematurely()
2774	 * makes a decision on the order we were last reclaiming at. However,
2775	 * if another caller entered the allocator slow path while kswapd
2776	 * was awake, order will remain at the higher level
2777	 */
2778	*classzone_idx = end_zone;
2779	return order;
2780}
2781
2782static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
2783{
2784	long remaining = 0;
2785	DEFINE_WAIT(wait);
2786
2787	if (freezing(current) || kthread_should_stop())
2788		return;
2789
2790	prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
2791
2792	/* Try to sleep for a short interval */
2793	if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
2794		remaining = schedule_timeout(HZ/10);
2795		finish_wait(&pgdat->kswapd_wait, &wait);
2796		prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
2797	}
2798
2799	/*
2800	 * After a short sleep, check if it was a premature sleep. If not, then
2801	 * go fully to sleep until explicitly woken up.
2802	 */
2803	if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
2804		trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
2805
2806		/*
2807		 * vmstat counters are not perfectly accurate and the estimated
2808		 * value for counters such as NR_FREE_PAGES can deviate from the
2809		 * true value by nr_online_cpus * threshold. To avoid the zone
2810		 * watermarks being breached while under pressure, we reduce the
2811		 * per-cpu vmstat threshold while kswapd is awake and restore
2812		 * them before going back to sleep.
2813		 */
2814		set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
2815		schedule();
2816		set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
2817	} else {
2818		if (remaining)
2819			count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
2820		else
2821			count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
2822	}
2823	finish_wait(&pgdat->kswapd_wait, &wait);
2824}
2825
2826/*
2827 * The background pageout daemon, started as a kernel thread
2828 * from the init process.
2829 *
2830 * This basically trickles out pages so that we have _some_
2831 * free memory available even if there is no other activity
2832 * that frees anything up. This is needed for things like routing
2833 * etc, where we otherwise might have all activity going on in
2834 * asynchronous contexts that cannot page things out.
2835 *
2836 * If there are applications that are active memory-allocators
2837 * (most normal use), this basically shouldn't matter.
2838 */
2839static int kswapd(void *p)
2840{
2841	unsigned long order, new_order;
2842	unsigned balanced_order;
2843	int classzone_idx, new_classzone_idx;
2844	int balanced_classzone_idx;
2845	pg_data_t *pgdat = (pg_data_t*)p;
2846	struct task_struct *tsk = current;
2847
2848	struct reclaim_state reclaim_state = {
2849		.reclaimed_slab = 0,
2850	};
2851	const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
2852
2853	lockdep_set_current_reclaim_state(GFP_KERNEL);
2854
2855	if (!cpumask_empty(cpumask))
2856		set_cpus_allowed_ptr(tsk, cpumask);
2857	current->reclaim_state = &reclaim_state;
2858
2859	/*
2860	 * Tell the memory management that we're a "memory allocator",
2861	 * and that if we need more memory we should get access to it
2862	 * regardless (see "__alloc_pages()"). "kswapd" should
2863	 * never get caught in the normal page freeing logic.
2864	 *
2865	 * (Kswapd normally doesn't need memory anyway, but sometimes
2866	 * you need a small amount of memory in order to be able to
2867	 * page out something else, and this flag essentially protects
2868	 * us from recursively trying to free more memory as we're
2869	 * trying to free the first piece of memory in the first place).
2870	 */
2871	tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
2872	set_freezable();
2873
2874	order = new_order = 0;
2875	balanced_order = 0;
2876	classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
2877	balanced_classzone_idx = classzone_idx;
2878	for ( ; ; ) {
2879		int ret;
2880
2881		/*
2882		 * If the last balance_pgdat was unsuccessful it's unlikely a
2883		 * new request of a similar or harder type will succeed soon
2884		 * so consider going to sleep on the basis we reclaimed at
2885		 */
2886		if (balanced_classzone_idx >= new_classzone_idx &&
2887					balanced_order == new_order) {
2888			new_order = pgdat->kswapd_max_order;
2889			new_classzone_idx = pgdat->classzone_idx;
2890			pgdat->kswapd_max_order =  0;
2891			pgdat->classzone_idx = pgdat->nr_zones - 1;
2892		}
2893
2894		if (order < new_order || classzone_idx > new_classzone_idx) {
2895			/*
2896			 * Don't sleep if someone wants a larger 'order'
2897			 * allocation or has tigher zone constraints
2898			 */
2899			order = new_order;
2900			classzone_idx = new_classzone_idx;
2901		} else {
2902			kswapd_try_to_sleep(pgdat, balanced_order,
2903						balanced_classzone_idx);
2904			order = pgdat->kswapd_max_order;
2905			classzone_idx = pgdat->classzone_idx;
2906			new_order = order;
2907			new_classzone_idx = classzone_idx;
2908			pgdat->kswapd_max_order = 0;
2909			pgdat->classzone_idx = pgdat->nr_zones - 1;
2910		}
2911
2912		ret = try_to_freeze();
2913		if (kthread_should_stop())
2914			break;
2915
2916		/*
2917		 * We can speed up thawing tasks if we don't call balance_pgdat
2918		 * after returning from the refrigerator
2919		 */
2920		if (!ret) {
2921			trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
2922			balanced_classzone_idx = classzone_idx;
2923			balanced_order = balance_pgdat(pgdat, order,
2924						&balanced_classzone_idx);
2925		}
2926	}
2927	return 0;
2928}
2929
2930/*
2931 * A zone is low on free memory, so wake its kswapd task to service it.
2932 */
2933void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
2934{
2935	pg_data_t *pgdat;
2936
2937	if (!populated_zone(zone))
2938		return;
2939
2940	if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
2941		return;
2942	pgdat = zone->zone_pgdat;
2943	if (pgdat->kswapd_max_order < order) {
2944		pgdat->kswapd_max_order = order;
2945		pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx);
2946	}
2947	if (!waitqueue_active(&pgdat->kswapd_wait))
2948		return;
2949	if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0))
2950		return;
2951
2952	trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
2953	wake_up_interruptible(&pgdat->kswapd_wait);
2954}
2955
2956/*
2957 * The reclaimable count would be mostly accurate.
2958 * The less reclaimable pages may be
2959 * - mlocked pages, which will be moved to unevictable list when encountered
2960 * - mapped pages, which may require several travels to be reclaimed
2961 * - dirty pages, which is not "instantly" reclaimable
2962 */
2963unsigned long global_reclaimable_pages(void)
2964{
2965	int nr;
2966
2967	nr = global_page_state(NR_ACTIVE_FILE) +
2968	     global_page_state(NR_INACTIVE_FILE);
2969
2970	if (nr_swap_pages > 0)
2971		nr += global_page_state(NR_ACTIVE_ANON) +
2972		      global_page_state(NR_INACTIVE_ANON);
2973
2974	return nr;
2975}
2976
2977unsigned long zone_reclaimable_pages(struct zone *zone)
2978{
2979	int nr;
2980
2981	nr = zone_page_state(zone, NR_ACTIVE_FILE) +
2982	     zone_page_state(zone, NR_INACTIVE_FILE);
2983
2984	if (nr_swap_pages > 0)
2985		nr += zone_page_state(zone, NR_ACTIVE_ANON) +
2986		      zone_page_state(zone, NR_INACTIVE_ANON);
2987
2988	return nr;
2989}
2990
2991#ifdef CONFIG_HIBERNATION
2992/*
2993 * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of
2994 * freed pages.
2995 *
2996 * Rather than trying to age LRUs the aim is to preserve the overall
2997 * LRU order by reclaiming preferentially
2998 * inactive > active > active referenced > active mapped
2999 */
3000unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
3001{
3002	struct reclaim_state reclaim_state;
3003	struct scan_control sc = {
3004		.gfp_mask = GFP_HIGHUSER_MOVABLE,
3005		.may_swap = 1,
3006		.may_unmap = 1,
3007		.may_writepage = 1,
3008		.nr_to_reclaim = nr_to_reclaim,
3009		.hibernation_mode = 1,
3010		.order = 0,
3011	};
3012	struct shrink_control shrink = {
3013		.gfp_mask = sc.gfp_mask,
3014	};
3015	struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
3016	struct task_struct *p = current;
3017	unsigned long nr_reclaimed;
3018
3019	p->flags |= PF_MEMALLOC;
3020	lockdep_set_current_reclaim_state(sc.gfp_mask);
3021	reclaim_state.reclaimed_slab = 0;
3022	p->reclaim_state = &reclaim_state;
3023
3024	nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
3025
3026	p->reclaim_state = NULL;
3027	lockdep_clear_current_reclaim_state();
3028	p->flags &= ~PF_MEMALLOC;
3029
3030	return nr_reclaimed;
3031}
3032#endif /* CONFIG_HIBERNATION */
3033
3034/* It's optimal to keep kswapds on the same CPUs as their memory, but
3035   not required for correctness.  So if the last cpu in a node goes
3036   away, we get changed to run anywhere: as the first one comes back,
3037   restore their cpu bindings. */
3038static int __devinit cpu_callback(struct notifier_block *nfb,
3039				  unsigned long action, void *hcpu)
3040{
3041	int nid;
3042
3043	if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) {
3044		for_each_node_state(nid, N_HIGH_MEMORY) {
3045			pg_data_t *pgdat = NODE_DATA(nid);
3046			const struct cpumask *mask;
3047
3048			mask = cpumask_of_node(pgdat->node_id);
3049
3050			if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
3051				/* One of our CPUs online: restore mask */
3052				set_cpus_allowed_ptr(pgdat->kswapd, mask);
3053		}
3054	}
3055	return NOTIFY_OK;
3056}
3057
3058/*
3059 * This kswapd start function will be called by init and node-hot-add.
3060 * On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added.
3061 */
3062int kswapd_run(int nid)
3063{
3064	pg_data_t *pgdat = NODE_DATA(nid);
3065	int ret = 0;
3066
3067	if (pgdat->kswapd)
3068		return 0;
3069
3070	pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
3071	if (IS_ERR(pgdat->kswapd)) {
3072		/* failure at boot is fatal */
3073		BUG_ON(system_state == SYSTEM_BOOTING);
3074		printk("Failed to start kswapd on node %d\n",nid);
3075		ret = -1;
3076	}
3077	return ret;
3078}
3079
3080/*
3081 * Called by memory hotplug when all memory in a node is offlined.
3082 */
3083void kswapd_stop(int nid)
3084{
3085	struct task_struct *kswapd = NODE_DATA(nid)->kswapd;
3086
3087	if (kswapd)
3088		kthread_stop(kswapd);
3089}
3090
3091static int __init kswapd_init(void)
3092{
3093	int nid;
3094
3095	swap_setup();
3096	for_each_node_state(nid, N_HIGH_MEMORY)
3097 		kswapd_run(nid);
3098	hotcpu_notifier(cpu_callback, 0);
3099	return 0;
3100}
3101
3102module_init(kswapd_init)
3103
3104#ifdef CONFIG_NUMA
3105/*
3106 * Zone reclaim mode
3107 *
3108 * If non-zero call zone_reclaim when the number of free pages falls below
3109 * the watermarks.
3110 */
3111int zone_reclaim_mode __read_mostly;
3112
3113#define RECLAIM_OFF 0
3114#define RECLAIM_ZONE (1<<0)	/* Run shrink_inactive_list on the zone */
3115#define RECLAIM_WRITE (1<<1)	/* Writeout pages during reclaim */
3116#define RECLAIM_SWAP (1<<2)	/* Swap pages out during reclaim */
3117
3118/*
3119 * Priority for ZONE_RECLAIM. This determines the fraction of pages
3120 * of a node considered for each zone_reclaim. 4 scans 1/16th of
3121 * a zone.
3122 */
3123#define ZONE_RECLAIM_PRIORITY 4
3124
3125/*
3126 * Percentage of pages in a zone that must be unmapped for zone_reclaim to
3127 * occur.
3128 */
3129int sysctl_min_unmapped_ratio = 1;
3130
3131/*
3132 * If the number of slab pages in a zone grows beyond this percentage then
3133 * slab reclaim needs to occur.
3134 */
3135int sysctl_min_slab_ratio = 5;
3136
3137static inline unsigned long zone_unmapped_file_pages(struct zone *zone)
3138{
3139	unsigned long file_mapped = zone_page_state(zone, NR_FILE_MAPPED);
3140	unsigned long file_lru = zone_page_state(zone, NR_INACTIVE_FILE) +
3141		zone_page_state(zone, NR_ACTIVE_FILE);
3142
3143	/*
3144	 * It's possible for there to be more file mapped pages than
3145	 * accounted for by the pages on the file LRU lists because
3146	 * tmpfs pages accounted for as ANON can also be FILE_MAPPED
3147	 */
3148	return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0;
3149}
3150
3151/* Work out how many page cache pages we can reclaim in this reclaim_mode */
3152static long zone_pagecache_reclaimable(struct zone *zone)
3153{
3154	long nr_pagecache_reclaimable;
3155	long delta = 0;
3156
3157	/*
3158	 * If RECLAIM_SWAP is set, then all file pages are considered
3159	 * potentially reclaimable. Otherwise, we have to worry about
3160	 * pages like swapcache and zone_unmapped_file_pages() provides
3161	 * a better estimate
3162	 */
3163	if (zone_reclaim_mode & RECLAIM_SWAP)
3164		nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES);
3165	else
3166		nr_pagecache_reclaimable = zone_unmapped_file_pages(zone);
3167
3168	/* If we can't clean pages, remove dirty pages from consideration */
3169	if (!(zone_reclaim_mode & RECLAIM_WRITE))
3170		delta += zone_page_state(zone, NR_FILE_DIRTY);
3171
3172	/* Watch for any possible underflows due to delta */
3173	if (unlikely(delta > nr_pagecache_reclaimable))
3174		delta = nr_pagecache_reclaimable;
3175
3176	return nr_pagecache_reclaimable - delta;
3177}
3178
3179/*
3180 * Try to free up some pages from this zone through reclaim.
3181 */
3182static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
3183{
3184	/* Minimum pages needed in order to stay on node */
3185	const unsigned long nr_pages = 1 << order;
3186	struct task_struct *p = current;
3187	struct reclaim_state reclaim_state;
3188	int priority;
3189	struct scan_control sc = {
3190		.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
3191		.may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
3192		.may_swap = 1,
3193		.nr_to_reclaim = max_t(unsigned long, nr_pages,
3194				       SWAP_CLUSTER_MAX),
3195		.gfp_mask = gfp_mask,
3196		.order = order,
3197	};
3198	struct shrink_control shrink = {
3199		.gfp_mask = sc.gfp_mask,
3200	};
3201	unsigned long nr_slab_pages0, nr_slab_pages1;
3202
3203	cond_resched();
3204	/*
3205	 * We need to be able to allocate from the reserves for RECLAIM_SWAP
3206	 * and we also need to be able to write out pages for RECLAIM_WRITE
3207	 * and RECLAIM_SWAP.
3208	 */
3209	p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
3210	lockdep_set_current_reclaim_state(gfp_mask);
3211	reclaim_state.reclaimed_slab = 0;
3212	p->reclaim_state = &reclaim_state;
3213
3214	if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) {
3215		/*
3216		 * Free memory by calling shrink zone with increasing
3217		 * priorities until we have enough memory freed.
3218		 */
3219		priority = ZONE_RECLAIM_PRIORITY;
3220		do {
3221			shrink_zone(priority, zone, &sc);
3222			priority--;
3223		} while (priority >= 0 && sc.nr_reclaimed < nr_pages);
3224	}
3225
3226	nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
3227	if (nr_slab_pages0 > zone->min_slab_pages) {
3228		/*
3229		 * shrink_slab() does not currently allow us to determine how
3230		 * many pages were freed in this zone. So we take the current
3231		 * number of slab pages and shake the slab until it is reduced
3232		 * by the same nr_pages that we used for reclaiming unmapped
3233		 * pages.
3234		 *
3235		 * Note that shrink_slab will free memory on all zones and may
3236		 * take a long time.
3237		 */
3238		for (;;) {
3239			unsigned long lru_pages = zone_reclaimable_pages(zone);
3240
3241			/* No reclaimable slab or very low memory pressure */
3242			if (!shrink_slab(&shrink, sc.nr_scanned, lru_pages))
3243				break;
3244
3245			/* Freed enough memory */
3246			nr_slab_pages1 = zone_page_state(zone,
3247							NR_SLAB_RECLAIMABLE);
3248			if (nr_slab_pages1 + nr_pages <= nr_slab_pages0)
3249				break;
3250		}
3251
3252		/*
3253		 * Update nr_reclaimed by the number of slab pages we
3254		 * reclaimed from this zone.
3255		 */
3256		nr_slab_pages1 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
3257		if (nr_slab_pages1 < nr_slab_pages0)
3258			sc.nr_reclaimed += nr_slab_pages0 - nr_slab_pages1;
3259	}
3260
3261	p->reclaim_state = NULL;
3262	current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
3263	lockdep_clear_current_reclaim_state();
3264	return sc.nr_reclaimed >= nr_pages;
3265}
3266
3267int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
3268{
3269	int node_id;
3270	int ret;
3271
3272	/*
3273	 * Zone reclaim reclaims unmapped file backed pages and
3274	 * slab pages if we are over the defined limits.
3275	 *
3276	 * A small portion of unmapped file backed pages is needed for
3277	 * file I/O otherwise pages read by file I/O will be immediately
3278	 * thrown out if the zone is overallocated. So we do not reclaim
3279	 * if less than a specified percentage of the zone is used by
3280	 * unmapped file backed pages.
3281	 */
3282	if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages &&
3283	    zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages)
3284		return ZONE_RECLAIM_FULL;
3285
3286	if (zone->all_unreclaimable)
3287		return ZONE_RECLAIM_FULL;
3288
3289	/*
3290	 * Do not scan if the allocation should not be delayed.
3291	 */
3292	if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC))
3293		return ZONE_RECLAIM_NOSCAN;
3294
3295	/*
3296	 * Only run zone reclaim on the local zone or on zones that do not
3297	 * have associated processors. This will favor the local processor
3298	 * over remote processors and spread off node memory allocations
3299	 * as wide as possible.
3300	 */
3301	node_id = zone_to_nid(zone);
3302	if (node_state(node_id, N_CPU) && node_id != numa_node_id())
3303		return ZONE_RECLAIM_NOSCAN;
3304
3305	if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED))
3306		return ZONE_RECLAIM_NOSCAN;
3307
3308	ret = __zone_reclaim(zone, gfp_mask, order);
3309	zone_clear_flag(zone, ZONE_RECLAIM_LOCKED);
3310
3311	if (!ret)
3312		count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);
3313
3314	return ret;
3315}
3316#endif
3317
3318/*
3319 * page_evictable - test whether a page is evictable
3320 * @page: the page to test
3321 * @vma: the VMA in which the page is or will be mapped, may be NULL
3322 *
3323 * Test whether page is evictable--i.e., should be placed on active/inactive
3324 * lists vs unevictable list.  The vma argument is !NULL when called from the
3325 * fault path to determine how to instantate a new page.
3326 *
3327 * Reasons page might not be evictable:
3328 * (1) page's mapping marked unevictable
3329 * (2) page is part of an mlocked VMA
3330 *
3331 */
3332int page_evictable(struct page *page, struct vm_area_struct *vma)
3333{
3334
3335	if (mapping_unevictable(page_mapping(page)))
3336		return 0;
3337
3338	if (PageMlocked(page) || (vma && is_mlocked_vma(vma, page)))
3339		return 0;
3340
3341	return 1;
3342}
3343
3344/**
3345 * check_move_unevictable_page - check page for evictability and move to appropriate zone lru list
3346 * @page: page to check evictability and move to appropriate lru list
3347 * @zone: zone page is in
3348 *
3349 * Checks a page for evictability and moves the page to the appropriate
3350 * zone lru list.
3351 *
3352 * Restrictions: zone->lru_lock must be held, page must be on LRU and must
3353 * have PageUnevictable set.
3354 */
3355static void check_move_unevictable_page(struct page *page, struct zone *zone)
3356{
3357	VM_BUG_ON(PageActive(page));
3358
3359retry:
3360	ClearPageUnevictable(page);
3361	if (page_evictable(page, NULL)) {
3362		enum lru_list l = page_lru_base_type(page);
3363
3364		__dec_zone_state(zone, NR_UNEVICTABLE);
3365		list_move(&page->lru, &zone->lru[l].list);
3366		mem_cgroup_move_lists(page, LRU_UNEVICTABLE, l);
3367		__inc_zone_state(zone, NR_INACTIVE_ANON + l);
3368		__count_vm_event(UNEVICTABLE_PGRESCUED);
3369	} else {
3370		/*
3371		 * rotate unevictable list
3372		 */
3373		SetPageUnevictable(page);
3374		list_move(&page->lru, &zone->lru[LRU_UNEVICTABLE].list);
3375		mem_cgroup_rotate_lru_list(page, LRU_UNEVICTABLE);
3376		if (page_evictable(page, NULL))
3377			goto retry;
3378	}
3379}
3380
3381/**
3382 * scan_mapping_unevictable_pages - scan an address space for evictable pages
3383 * @mapping: struct address_space to scan for evictable pages
3384 *
3385 * Scan all pages in mapping.  Check unevictable pages for
3386 * evictability and move them to the appropriate zone lru list.
3387 */
3388void scan_mapping_unevictable_pages(struct address_space *mapping)
3389{
3390	pgoff_t next = 0;
3391	pgoff_t end   = (i_size_read(mapping->host) + PAGE_CACHE_SIZE - 1) >>
3392			 PAGE_CACHE_SHIFT;
3393	struct zone *zone;
3394	struct pagevec pvec;
3395
3396	if (mapping->nrpages == 0)
3397		return;
3398
3399	pagevec_init(&pvec, 0);
3400	while (next < end &&
3401		pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
3402		int i;
3403		int pg_scanned = 0;
3404
3405		zone = NULL;
3406
3407		for (i = 0; i < pagevec_count(&pvec); i++) {
3408			struct page *page = pvec.pages[i];
3409			pgoff_t page_index = page->index;
3410			struct zone *pagezone = page_zone(page);
3411
3412			pg_scanned++;
3413			if (page_index > next)
3414				next = page_index;
3415			next++;
3416
3417			if (pagezone != zone) {
3418				if (zone)
3419					spin_unlock_irq(&zone->lru_lock);
3420				zone = pagezone;
3421				spin_lock_irq(&zone->lru_lock);
3422			}
3423
3424			if (PageLRU(page) && PageUnevictable(page))
3425				check_move_unevictable_page(page, zone);
3426		}
3427		if (zone)
3428			spin_unlock_irq(&zone->lru_lock);
3429		pagevec_release(&pvec);
3430
3431		count_vm_events(UNEVICTABLE_PGSCANNED, pg_scanned);
3432	}
3433
3434}
3435
3436static void warn_scan_unevictable_pages(void)
3437{
3438	printk_once(KERN_WARNING
3439		    "The scan_unevictable_pages sysctl/node-interface has been "
3440		    "disabled for lack of a legitimate use case.  If you have "
3441		    "one, please send an email to linux-mm@kvack.org.\n");
3442}
3443
3444/*
3445 * scan_unevictable_pages [vm] sysctl handler.  On demand re-scan of
3446 * all nodes' unevictable lists for evictable pages
3447 */
3448unsigned long scan_unevictable_pages;
3449
3450int scan_unevictable_handler(struct ctl_table *table, int write,
3451			   void __user *buffer,
3452			   size_t *length, loff_t *ppos)
3453{
3454	warn_scan_unevictable_pages();
3455	proc_doulongvec_minmax(table, write, buffer, length, ppos);
3456	scan_unevictable_pages = 0;
3457	return 0;
3458}
3459
3460#ifdef CONFIG_NUMA
3461/*
3462 * per node 'scan_unevictable_pages' attribute.  On demand re-scan of
3463 * a specified node's per zone unevictable lists for evictable pages.
3464 */
3465
3466static ssize_t read_scan_unevictable_node(struct device *dev,
3467					  struct device_attribute *attr,
3468					  char *buf)
3469{
3470	warn_scan_unevictable_pages();
3471	return sprintf(buf, "0\n");	/* always zero; should fit... */
3472}
3473
3474static ssize_t write_scan_unevictable_node(struct device *dev,
3475					   struct device_attribute *attr,
3476					const char *buf, size_t count)
3477{
3478	warn_scan_unevictable_pages();
3479	return 1;
3480}
3481
3482
3483static DEVICE_ATTR(scan_unevictable_pages, S_IRUGO | S_IWUSR,
3484			read_scan_unevictable_node,
3485			write_scan_unevictable_node);
3486
3487int scan_unevictable_register_node(struct node *node)
3488{
3489	return device_create_file(&node->dev, &dev_attr_scan_unevictable_pages);
3490}
3491
3492void scan_unevictable_unregister_node(struct node *node)
3493{
3494	device_remove_file(&node->dev, &dev_attr_scan_unevictable_pages);
3495}
3496#endif
3497