rmap.c revision 34bbd704051c9d053d69e90569a3a2365f4c7b50
1/*
2 * mm/rmap.c - physical to virtual reverse mappings
3 *
4 * Copyright 2001, Rik van Riel <riel@conectiva.com.br>
5 * Released under the General Public License (GPL).
6 *
7 * Simple, low overhead reverse mapping scheme.
8 * Please try to keep this thing as modular as possible.
9 *
10 * Provides methods for unmapping each kind of mapped page:
11 * the anon methods track anonymous pages, and
12 * the file methods track pages belonging to an inode.
13 *
14 * Original design by Rik van Riel <riel@conectiva.com.br> 2001
15 * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004
16 * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004
17 * Contributions by Hugh Dickins <hugh@veritas.com> 2003, 2004
18 */
19
20/*
21 * Lock ordering in mm:
22 *
23 * inode->i_mutex	(while writing or truncating, not reading or faulting)
24 *   inode->i_alloc_sem (vmtruncate_range)
25 *   mm->mmap_sem
26 *     page->flags PG_locked (lock_page)
27 *       mapping->i_mmap_lock
28 *         anon_vma->lock
29 *           mm->page_table_lock or pte_lock
30 *             zone->lru_lock (in mark_page_accessed, isolate_lru_page)
31 *             swap_lock (in swap_duplicate, swap_info_get)
32 *               mmlist_lock (in mmput, drain_mmlist and others)
33 *               mapping->private_lock (in __set_page_dirty_buffers)
34 *               inode_lock (in set_page_dirty's __mark_inode_dirty)
35 *                 sb_lock (within inode_lock in fs/fs-writeback.c)
36 *                 mapping->tree_lock (widely used, in set_page_dirty,
37 *                           in arch-dependent flush_dcache_mmap_lock,
38 *                           within inode_lock in __sync_single_inode)
39 */
40
41#include <linux/mm.h>
42#include <linux/pagemap.h>
43#include <linux/swap.h>
44#include <linux/swapops.h>
45#include <linux/slab.h>
46#include <linux/init.h>
47#include <linux/rmap.h>
48#include <linux/rcupdate.h>
49#include <linux/module.h>
50#include <linux/kallsyms.h>
51
52#include <asm/tlbflush.h>
53
/* Slab cache for struct anon_vma objects; set up by anon_vma_init(). */
struct kmem_cache *anon_vma_cachep;

/*
 * Debug-only sanity check, compiled in under CONFIG_DEBUG_VM and empty
 * otherwise.  Walks the list of the anon_vma that @find_vma points at and
 * BUGs if @find_vma is not actually linked on it, or if more than 100000
 * entries are encountered.  Takes no lock itself.
 */
static inline void validate_anon_vma(struct vm_area_struct *find_vma)
{
#ifdef CONFIG_DEBUG_VM
	struct anon_vma *anon_vma = find_vma->anon_vma;
	struct vm_area_struct *pos;
	unsigned int walked = 0;
	int seen = 0;

	list_for_each_entry(pos, &anon_vma->head, anon_vma_node) {
		walked++;
		BUG_ON(walked > 100000);
		seen |= (pos == find_vma);
	}
	BUG_ON(!seen);
#endif
}
73
74/* This must be called under the mmap_sem. */
75int anon_vma_prepare(struct vm_area_struct *vma)
76{
77	struct anon_vma *anon_vma = vma->anon_vma;
78
79	might_sleep();
80	if (unlikely(!anon_vma)) {
81		struct mm_struct *mm = vma->vm_mm;
82		struct anon_vma *allocated, *locked;
83
84		anon_vma = find_mergeable_anon_vma(vma);
85		if (anon_vma) {
86			allocated = NULL;
87			locked = anon_vma;
88			spin_lock(&locked->lock);
89		} else {
90			anon_vma = anon_vma_alloc();
91			if (unlikely(!anon_vma))
92				return -ENOMEM;
93			allocated = anon_vma;
94			locked = NULL;
95		}
96
97		/* page_table_lock to protect against threads */
98		spin_lock(&mm->page_table_lock);
99		if (likely(!vma->anon_vma)) {
100			vma->anon_vma = anon_vma;
101			list_add_tail(&vma->anon_vma_node, &anon_vma->head);
102			allocated = NULL;
103		}
104		spin_unlock(&mm->page_table_lock);
105
106		if (locked)
107			spin_unlock(&locked->lock);
108		if (unlikely(allocated))
109			anon_vma_free(allocated);
110	}
111	return 0;
112}
113
114void __anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next)
115{
116	BUG_ON(vma->anon_vma != next->anon_vma);
117	list_del(&next->anon_vma_node);
118}
119
120void __anon_vma_link(struct vm_area_struct *vma)
121{
122	struct anon_vma *anon_vma = vma->anon_vma;
123
124	if (anon_vma) {
125		list_add_tail(&vma->anon_vma_node, &anon_vma->head);
126		validate_anon_vma(vma);
127	}
128}
129
130void anon_vma_link(struct vm_area_struct *vma)
131{
132	struct anon_vma *anon_vma = vma->anon_vma;
133
134	if (anon_vma) {
135		spin_lock(&anon_vma->lock);
136		list_add_tail(&vma->anon_vma_node, &anon_vma->head);
137		validate_anon_vma(vma);
138		spin_unlock(&anon_vma->lock);
139	}
140}
141
142void anon_vma_unlink(struct vm_area_struct *vma)
143{
144	struct anon_vma *anon_vma = vma->anon_vma;
145	int empty;
146
147	if (!anon_vma)
148		return;
149
150	spin_lock(&anon_vma->lock);
151	validate_anon_vma(vma);
152	list_del(&vma->anon_vma_node);
153
154	/* We must garbage collect the anon_vma if it's empty */
155	empty = list_empty(&anon_vma->head);
156	spin_unlock(&anon_vma->lock);
157
158	if (empty)
159		anon_vma_free(anon_vma);
160}
161
162static void anon_vma_ctor(void *data, struct kmem_cache *cachep,
163			  unsigned long flags)
164{
165	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
166						SLAB_CTOR_CONSTRUCTOR) {
167		struct anon_vma *anon_vma = data;
168
169		spin_lock_init(&anon_vma->lock);
170		INIT_LIST_HEAD(&anon_vma->head);
171	}
172}
173
174void __init anon_vma_init(void)
175{
176	anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
177			0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor, NULL);
178}
179
180/*
181 * Getting a lock on a stable anon_vma from a page off the LRU is
182 * tricky: page_lock_anon_vma rely on RCU to guard against the races.
183 */
184static struct anon_vma *page_lock_anon_vma(struct page *page)
185{
186	struct anon_vma *anon_vma;
187	unsigned long anon_mapping;
188
189	rcu_read_lock();
190	anon_mapping = (unsigned long) page->mapping;
191	if (!(anon_mapping & PAGE_MAPPING_ANON))
192		goto out;
193	if (!page_mapped(page))
194		goto out;
195
196	anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
197	spin_lock(&anon_vma->lock);
198	return anon_vma;
199out:
200	rcu_read_unlock();
201	return NULL;
202}
203
204static void page_unlock_anon_vma(struct anon_vma *anon_vma)
205{
206	spin_unlock(&anon_vma->lock);
207	rcu_read_unlock();
208}
209
210/*
211 * At what user virtual address is page expected in vma?
212 */
213static inline unsigned long
214vma_address(struct page *page, struct vm_area_struct *vma)
215{
216	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
217	unsigned long address;
218
219	address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
220	if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
221		/* page should be within any vma from prio_tree_next */
222		BUG_ON(!PageAnon(page));
223		return -EFAULT;
224	}
225	return address;
226}
227
228/*
229 * At what user virtual address is page expected in vma? checking that the
230 * page matches the vma: currently only used on anon pages, by unuse_vma;
231 */
232unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
233{
234	if (PageAnon(page)) {
235		if ((void *)vma->anon_vma !=
236		    (void *)page->mapping - PAGE_MAPPING_ANON)
237			return -EFAULT;
238	} else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) {
239		if (!vma->vm_file ||
240		    vma->vm_file->f_mapping != page->mapping)
241			return -EFAULT;
242	} else
243		return -EFAULT;
244	return vma_address(page, vma);
245}
246
247/*
248 * Check that @page is mapped at @address into @mm.
249 *
250 * On success returns with pte mapped and locked.
251 */
252pte_t *page_check_address(struct page *page, struct mm_struct *mm,
253			  unsigned long address, spinlock_t **ptlp)
254{
255	pgd_t *pgd;
256	pud_t *pud;
257	pmd_t *pmd;
258	pte_t *pte;
259	spinlock_t *ptl;
260
261	pgd = pgd_offset(mm, address);
262	if (!pgd_present(*pgd))
263		return NULL;
264
265	pud = pud_offset(pgd, address);
266	if (!pud_present(*pud))
267		return NULL;
268
269	pmd = pmd_offset(pud, address);
270	if (!pmd_present(*pmd))
271		return NULL;
272
273	pte = pte_offset_map(pmd, address);
274	/* Make a quick check before getting the lock */
275	if (!pte_present(*pte)) {
276		pte_unmap(pte);
277		return NULL;
278	}
279
280	ptl = pte_lockptr(mm, pmd);
281	spin_lock(ptl);
282	if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) {
283		*ptlp = ptl;
284		return pte;
285	}
286	pte_unmap_unlock(pte, ptl);
287	return NULL;
288}
289
290/*
291 * Subfunctions of page_referenced: page_referenced_one called
292 * repeatedly from either page_referenced_anon or page_referenced_file.
293 */
294static int page_referenced_one(struct page *page,
295	struct vm_area_struct *vma, unsigned int *mapcount)
296{
297	struct mm_struct *mm = vma->vm_mm;
298	unsigned long address;
299	pte_t *pte;
300	spinlock_t *ptl;
301	int referenced = 0;
302
303	address = vma_address(page, vma);
304	if (address == -EFAULT)
305		goto out;
306
307	pte = page_check_address(page, mm, address, &ptl);
308	if (!pte)
309		goto out;
310
311	if (ptep_clear_flush_young(vma, address, pte))
312		referenced++;
313
314	/* Pretend the page is referenced if the task has the
315	   swap token and is in the middle of a page fault. */
316	if (mm != current->mm && has_swap_token(mm) &&
317			rwsem_is_locked(&mm->mmap_sem))
318		referenced++;
319
320	(*mapcount)--;
321	pte_unmap_unlock(pte, ptl);
322out:
323	return referenced;
324}
325
326static int page_referenced_anon(struct page *page)
327{
328	unsigned int mapcount;
329	struct anon_vma *anon_vma;
330	struct vm_area_struct *vma;
331	int referenced = 0;
332
333	anon_vma = page_lock_anon_vma(page);
334	if (!anon_vma)
335		return referenced;
336
337	mapcount = page_mapcount(page);
338	list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
339		referenced += page_referenced_one(page, vma, &mapcount);
340		if (!mapcount)
341			break;
342	}
343
344	page_unlock_anon_vma(anon_vma);
345	return referenced;
346}
347
348/**
349 * page_referenced_file - referenced check for object-based rmap
350 * @page: the page we're checking references on.
351 *
352 * For an object-based mapped page, find all the places it is mapped and
353 * check/clear the referenced flag.  This is done by following the page->mapping
354 * pointer, then walking the chain of vmas it holds.  It returns the number
355 * of references it found.
356 *
357 * This function is only called from page_referenced for object-based pages.
358 */
359static int page_referenced_file(struct page *page)
360{
361	unsigned int mapcount;
362	struct address_space *mapping = page->mapping;
363	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
364	struct vm_area_struct *vma;
365	struct prio_tree_iter iter;
366	int referenced = 0;
367
368	/*
369	 * The caller's checks on page->mapping and !PageAnon have made
370	 * sure that this is a file page: the check for page->mapping
371	 * excludes the case just before it gets set on an anon page.
372	 */
373	BUG_ON(PageAnon(page));
374
375	/*
376	 * The page lock not only makes sure that page->mapping cannot
377	 * suddenly be NULLified by truncation, it makes sure that the
378	 * structure at mapping cannot be freed and reused yet,
379	 * so we can safely take mapping->i_mmap_lock.
380	 */
381	BUG_ON(!PageLocked(page));
382
383	spin_lock(&mapping->i_mmap_lock);
384
385	/*
386	 * i_mmap_lock does not stabilize mapcount at all, but mapcount
387	 * is more likely to be accurate if we note it after spinning.
388	 */
389	mapcount = page_mapcount(page);
390
391	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
392		if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE))
393				  == (VM_LOCKED|VM_MAYSHARE)) {
394			referenced++;
395			break;
396		}
397		referenced += page_referenced_one(page, vma, &mapcount);
398		if (!mapcount)
399			break;
400	}
401
402	spin_unlock(&mapping->i_mmap_lock);
403	return referenced;
404}
405
406/**
407 * page_referenced - test if the page was referenced
408 * @page: the page to test
409 * @is_locked: caller holds lock on the page
410 *
411 * Quick test_and_clear_referenced for all mappings to a page,
412 * returns the number of ptes which referenced the page.
413 */
414int page_referenced(struct page *page, int is_locked)
415{
416	int referenced = 0;
417
418	if (page_test_and_clear_young(page))
419		referenced++;
420
421	if (TestClearPageReferenced(page))
422		referenced++;
423
424	if (page_mapped(page) && page->mapping) {
425		if (PageAnon(page))
426			referenced += page_referenced_anon(page);
427		else if (is_locked)
428			referenced += page_referenced_file(page);
429		else if (TestSetPageLocked(page))
430			referenced++;
431		else {
432			if (page->mapping)
433				referenced += page_referenced_file(page);
434			unlock_page(page);
435		}
436	}
437	return referenced;
438}
439
440static int page_mkclean_one(struct page *page, struct vm_area_struct *vma)
441{
442	struct mm_struct *mm = vma->vm_mm;
443	unsigned long address;
444	pte_t *pte;
445	spinlock_t *ptl;
446	int ret = 0;
447
448	address = vma_address(page, vma);
449	if (address == -EFAULT)
450		goto out;
451
452	pte = page_check_address(page, mm, address, &ptl);
453	if (!pte)
454		goto out;
455
456	if (pte_dirty(*pte) || pte_write(*pte)) {
457		pte_t entry;
458
459		flush_cache_page(vma, address, pte_pfn(*pte));
460		entry = ptep_clear_flush(vma, address, pte);
461		entry = pte_wrprotect(entry);
462		entry = pte_mkclean(entry);
463		set_pte_at(mm, address, pte, entry);
464		lazy_mmu_prot_update(entry);
465		ret = 1;
466	}
467
468	pte_unmap_unlock(pte, ptl);
469out:
470	return ret;
471}
472
473static int page_mkclean_file(struct address_space *mapping, struct page *page)
474{
475	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
476	struct vm_area_struct *vma;
477	struct prio_tree_iter iter;
478	int ret = 0;
479
480	BUG_ON(PageAnon(page));
481
482	spin_lock(&mapping->i_mmap_lock);
483	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
484		if (vma->vm_flags & VM_SHARED)
485			ret += page_mkclean_one(page, vma);
486	}
487	spin_unlock(&mapping->i_mmap_lock);
488	return ret;
489}
490
491int page_mkclean(struct page *page)
492{
493	int ret = 0;
494
495	BUG_ON(!PageLocked(page));
496
497	if (page_mapped(page)) {
498		struct address_space *mapping = page_mapping(page);
499		if (mapping)
500			ret = page_mkclean_file(mapping, page);
501	}
502	if (page_test_and_clear_dirty(page))
503		ret = 1;
504
505	return ret;
506}
507
508/**
509 * page_set_anon_rmap - setup new anonymous rmap
510 * @page:	the page to add the mapping to
511 * @vma:	the vm area in which the mapping is added
512 * @address:	the user virtual address mapped
513 */
514static void __page_set_anon_rmap(struct page *page,
515	struct vm_area_struct *vma, unsigned long address)
516{
517	struct anon_vma *anon_vma = vma->anon_vma;
518
519	BUG_ON(!anon_vma);
520	anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
521	page->mapping = (struct address_space *) anon_vma;
522
523	page->index = linear_page_index(vma, address);
524
525	/*
526	 * nr_mapped state can be updated without turning off
527	 * interrupts because it is not modified via interrupt.
528	 */
529	__inc_zone_page_state(page, NR_ANON_PAGES);
530}
531
532/**
533 * page_add_anon_rmap - add pte mapping to an anonymous page
534 * @page:	the page to add the mapping to
535 * @vma:	the vm area in which the mapping is added
536 * @address:	the user virtual address mapped
537 *
538 * The caller needs to hold the pte lock.
539 */
540void page_add_anon_rmap(struct page *page,
541	struct vm_area_struct *vma, unsigned long address)
542{
543	if (atomic_inc_and_test(&page->_mapcount))
544		__page_set_anon_rmap(page, vma, address);
545	/* else checking page index and mapping is racy */
546}
547
548/*
549 * page_add_new_anon_rmap - add pte mapping to a new anonymous page
550 * @page:	the page to add the mapping to
551 * @vma:	the vm area in which the mapping is added
552 * @address:	the user virtual address mapped
553 *
554 * Same as page_add_anon_rmap but must only be called on *new* pages.
555 * This means the inc-and-test can be bypassed.
556 */
557void page_add_new_anon_rmap(struct page *page,
558	struct vm_area_struct *vma, unsigned long address)
559{
560	atomic_set(&page->_mapcount, 0); /* elevate count by 1 (starts at -1) */
561	__page_set_anon_rmap(page, vma, address);
562}
563
564/**
565 * page_add_file_rmap - add pte mapping to a file page
566 * @page: the page to add the mapping to
567 *
568 * The caller needs to hold the pte lock.
569 */
570void page_add_file_rmap(struct page *page)
571{
572	if (atomic_inc_and_test(&page->_mapcount))
573		__inc_zone_page_state(page, NR_FILE_MAPPED);
574}
575
576/**
577 * page_remove_rmap - take down pte mapping from a page
578 * @page: page to remove mapping from
579 *
580 * The caller needs to hold the pte lock.
581 */
582void page_remove_rmap(struct page *page, struct vm_area_struct *vma)
583{
584	if (atomic_add_negative(-1, &page->_mapcount)) {
585		if (unlikely(page_mapcount(page) < 0)) {
586			printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page));
587			printk (KERN_EMERG "  page pfn = %lx\n", page_to_pfn(page));
588			printk (KERN_EMERG "  page->flags = %lx\n", page->flags);
589			printk (KERN_EMERG "  page->count = %x\n", page_count(page));
590			printk (KERN_EMERG "  page->mapping = %p\n", page->mapping);
591			print_symbol (KERN_EMERG "  vma->vm_ops = %s\n", (unsigned long)vma->vm_ops);
592			if (vma->vm_ops)
593				print_symbol (KERN_EMERG "  vma->vm_ops->nopage = %s\n", (unsigned long)vma->vm_ops->nopage);
594			if (vma->vm_file && vma->vm_file->f_op)
595				print_symbol (KERN_EMERG "  vma->vm_file->f_op->mmap = %s\n", (unsigned long)vma->vm_file->f_op->mmap);
596			BUG();
597		}
598
599		/*
600		 * It would be tidy to reset the PageAnon mapping here,
601		 * but that might overwrite a racing page_add_anon_rmap
602		 * which increments mapcount after us but sets mapping
603		 * before us: so leave the reset to free_hot_cold_page,
604		 * and remember that it's only reliable while mapped.
605		 * Leaving it set also helps swapoff to reinstate ptes
606		 * faster for those pages still in swapcache.
607		 */
608		if (page_test_and_clear_dirty(page))
609			set_page_dirty(page);
610		__dec_zone_page_state(page,
611				PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED);
612	}
613}
614
615/*
616 * Subfunctions of try_to_unmap: try_to_unmap_one called
617 * repeatedly from either try_to_unmap_anon or try_to_unmap_file.
618 */
619static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
620				int migration)
621{
622	struct mm_struct *mm = vma->vm_mm;
623	unsigned long address;
624	pte_t *pte;
625	pte_t pteval;
626	spinlock_t *ptl;
627	int ret = SWAP_AGAIN;
628
629	address = vma_address(page, vma);
630	if (address == -EFAULT)
631		goto out;
632
633	pte = page_check_address(page, mm, address, &ptl);
634	if (!pte)
635		goto out;
636
637	/*
638	 * If the page is mlock()d, we cannot swap it out.
639	 * If it's recently referenced (perhaps page_referenced
640	 * skipped over this mm) then we should reactivate it.
641	 */
642	if (!migration && ((vma->vm_flags & VM_LOCKED) ||
643			(ptep_clear_flush_young(vma, address, pte)))) {
644		ret = SWAP_FAIL;
645		goto out_unmap;
646	}
647
648	/* Nuke the page table entry. */
649	flush_cache_page(vma, address, page_to_pfn(page));
650	pteval = ptep_clear_flush(vma, address, pte);
651
652	/* Move the dirty bit to the physical page now the pte is gone. */
653	if (pte_dirty(pteval))
654		set_page_dirty(page);
655
656	/* Update high watermark before we lower rss */
657	update_hiwater_rss(mm);
658
659	if (PageAnon(page)) {
660		swp_entry_t entry = { .val = page_private(page) };
661
662		if (PageSwapCache(page)) {
663			/*
664			 * Store the swap location in the pte.
665			 * See handle_pte_fault() ...
666			 */
667			swap_duplicate(entry);
668			if (list_empty(&mm->mmlist)) {
669				spin_lock(&mmlist_lock);
670				if (list_empty(&mm->mmlist))
671					list_add(&mm->mmlist, &init_mm.mmlist);
672				spin_unlock(&mmlist_lock);
673			}
674			dec_mm_counter(mm, anon_rss);
675#ifdef CONFIG_MIGRATION
676		} else {
677			/*
678			 * Store the pfn of the page in a special migration
679			 * pte. do_swap_page() will wait until the migration
680			 * pte is removed and then restart fault handling.
681			 */
682			BUG_ON(!migration);
683			entry = make_migration_entry(page, pte_write(pteval));
684#endif
685		}
686		set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
687		BUG_ON(pte_file(*pte));
688	} else
689#ifdef CONFIG_MIGRATION
690	if (migration) {
691		/* Establish migration entry for a file page */
692		swp_entry_t entry;
693		entry = make_migration_entry(page, pte_write(pteval));
694		set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
695	} else
696#endif
697		dec_mm_counter(mm, file_rss);
698
699
700	page_remove_rmap(page, vma);
701	page_cache_release(page);
702
703out_unmap:
704	pte_unmap_unlock(pte, ptl);
705out:
706	return ret;
707}
708
709/*
710 * objrmap doesn't work for nonlinear VMAs because the assumption that
711 * offset-into-file correlates with offset-into-virtual-addresses does not hold.
712 * Consequently, given a particular page and its ->index, we cannot locate the
713 * ptes which are mapping that page without an exhaustive linear search.
714 *
715 * So what this code does is a mini "virtual scan" of each nonlinear VMA which
716 * maps the file to which the target page belongs.  The ->vm_private_data field
717 * holds the current cursor into that scan.  Successive searches will circulate
718 * around the vma's virtual address space.
719 *
720 * So as more replacement pressure is applied to the pages in a nonlinear VMA,
721 * more scanning pressure is placed against them as well.   Eventually pages
722 * will become fully unmapped and are eligible for eviction.
723 *
724 * For very sparsely populated VMAs this is a little inefficient - chances are
725 * there there won't be many ptes located within the scan cluster.  In this case
726 * maybe we could scan further - to the end of the pte page, perhaps.
727 */
728#define CLUSTER_SIZE	min(32*PAGE_SIZE, PMD_SIZE)
729#define CLUSTER_MASK	(~(CLUSTER_SIZE - 1))
730
731static void try_to_unmap_cluster(unsigned long cursor,
732	unsigned int *mapcount, struct vm_area_struct *vma)
733{
734	struct mm_struct *mm = vma->vm_mm;
735	pgd_t *pgd;
736	pud_t *pud;
737	pmd_t *pmd;
738	pte_t *pte;
739	pte_t pteval;
740	spinlock_t *ptl;
741	struct page *page;
742	unsigned long address;
743	unsigned long end;
744
745	address = (vma->vm_start + cursor) & CLUSTER_MASK;
746	end = address + CLUSTER_SIZE;
747	if (address < vma->vm_start)
748		address = vma->vm_start;
749	if (end > vma->vm_end)
750		end = vma->vm_end;
751
752	pgd = pgd_offset(mm, address);
753	if (!pgd_present(*pgd))
754		return;
755
756	pud = pud_offset(pgd, address);
757	if (!pud_present(*pud))
758		return;
759
760	pmd = pmd_offset(pud, address);
761	if (!pmd_present(*pmd))
762		return;
763
764	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
765
766	/* Update high watermark before we lower rss */
767	update_hiwater_rss(mm);
768
769	for (; address < end; pte++, address += PAGE_SIZE) {
770		if (!pte_present(*pte))
771			continue;
772		page = vm_normal_page(vma, address, *pte);
773		BUG_ON(!page || PageAnon(page));
774
775		if (ptep_clear_flush_young(vma, address, pte))
776			continue;
777
778		/* Nuke the page table entry. */
779		flush_cache_page(vma, address, pte_pfn(*pte));
780		pteval = ptep_clear_flush(vma, address, pte);
781
782		/* If nonlinear, store the file page offset in the pte. */
783		if (page->index != linear_page_index(vma, address))
784			set_pte_at(mm, address, pte, pgoff_to_pte(page->index));
785
786		/* Move the dirty bit to the physical page now the pte is gone. */
787		if (pte_dirty(pteval))
788			set_page_dirty(page);
789
790		page_remove_rmap(page, vma);
791		page_cache_release(page);
792		dec_mm_counter(mm, file_rss);
793		(*mapcount)--;
794	}
795	pte_unmap_unlock(pte - 1, ptl);
796}
797
798static int try_to_unmap_anon(struct page *page, int migration)
799{
800	struct anon_vma *anon_vma;
801	struct vm_area_struct *vma;
802	int ret = SWAP_AGAIN;
803
804	anon_vma = page_lock_anon_vma(page);
805	if (!anon_vma)
806		return ret;
807
808	list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
809		ret = try_to_unmap_one(page, vma, migration);
810		if (ret == SWAP_FAIL || !page_mapped(page))
811			break;
812	}
813
814	page_unlock_anon_vma(anon_vma);
815	return ret;
816}
817
818/**
819 * try_to_unmap_file - unmap file page using the object-based rmap method
820 * @page: the page to unmap
821 *
822 * Find all the mappings of a page using the mapping pointer and the vma chains
823 * contained in the address_space struct it points to.
824 *
825 * This function is only called from try_to_unmap for object-based pages.
826 */
827static int try_to_unmap_file(struct page *page, int migration)
828{
829	struct address_space *mapping = page->mapping;
830	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
831	struct vm_area_struct *vma;
832	struct prio_tree_iter iter;
833	int ret = SWAP_AGAIN;
834	unsigned long cursor;
835	unsigned long max_nl_cursor = 0;
836	unsigned long max_nl_size = 0;
837	unsigned int mapcount;
838
839	spin_lock(&mapping->i_mmap_lock);
840	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
841		ret = try_to_unmap_one(page, vma, migration);
842		if (ret == SWAP_FAIL || !page_mapped(page))
843			goto out;
844	}
845
846	if (list_empty(&mapping->i_mmap_nonlinear))
847		goto out;
848
849	list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
850						shared.vm_set.list) {
851		if ((vma->vm_flags & VM_LOCKED) && !migration)
852			continue;
853		cursor = (unsigned long) vma->vm_private_data;
854		if (cursor > max_nl_cursor)
855			max_nl_cursor = cursor;
856		cursor = vma->vm_end - vma->vm_start;
857		if (cursor > max_nl_size)
858			max_nl_size = cursor;
859	}
860
861	if (max_nl_size == 0) {	/* any nonlinears locked or reserved */
862		ret = SWAP_FAIL;
863		goto out;
864	}
865
866	/*
867	 * We don't try to search for this page in the nonlinear vmas,
868	 * and page_referenced wouldn't have found it anyway.  Instead
869	 * just walk the nonlinear vmas trying to age and unmap some.
870	 * The mapcount of the page we came in with is irrelevant,
871	 * but even so use it as a guide to how hard we should try?
872	 */
873	mapcount = page_mapcount(page);
874	if (!mapcount)
875		goto out;
876	cond_resched_lock(&mapping->i_mmap_lock);
877
878	max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK;
879	if (max_nl_cursor == 0)
880		max_nl_cursor = CLUSTER_SIZE;
881
882	do {
883		list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
884						shared.vm_set.list) {
885			if ((vma->vm_flags & VM_LOCKED) && !migration)
886				continue;
887			cursor = (unsigned long) vma->vm_private_data;
888			while ( cursor < max_nl_cursor &&
889				cursor < vma->vm_end - vma->vm_start) {
890				try_to_unmap_cluster(cursor, &mapcount, vma);
891				cursor += CLUSTER_SIZE;
892				vma->vm_private_data = (void *) cursor;
893				if ((int)mapcount <= 0)
894					goto out;
895			}
896			vma->vm_private_data = (void *) max_nl_cursor;
897		}
898		cond_resched_lock(&mapping->i_mmap_lock);
899		max_nl_cursor += CLUSTER_SIZE;
900	} while (max_nl_cursor <= max_nl_size);
901
902	/*
903	 * Don't loop forever (perhaps all the remaining pages are
904	 * in locked vmas).  Reset cursor on all unreserved nonlinear
905	 * vmas, now forgetting on which ones it had fallen behind.
906	 */
907	list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
908		vma->vm_private_data = NULL;
909out:
910	spin_unlock(&mapping->i_mmap_lock);
911	return ret;
912}
913
914/**
915 * try_to_unmap - try to remove all page table mappings to a page
916 * @page: the page to get unmapped
917 *
918 * Tries to remove all the page table entries which are mapping this
919 * page, used in the pageout path.  Caller must hold the page lock.
920 * Return values are:
921 *
922 * SWAP_SUCCESS	- we succeeded in removing all mappings
923 * SWAP_AGAIN	- we missed a mapping, try again later
924 * SWAP_FAIL	- the page is unswappable
925 */
926int try_to_unmap(struct page *page, int migration)
927{
928	int ret;
929
930	BUG_ON(!PageLocked(page));
931
932	if (PageAnon(page))
933		ret = try_to_unmap_anon(page, migration);
934	else
935		ret = try_to_unmap_file(page, migration);
936
937	if (!page_mapped(page))
938		ret = SWAP_SUCCESS;
939	return ret;
940}
941
942