rmap.c revision 64cdd548ffe26849d4cd113ac640f60606063b14
1/*
2 * mm/rmap.c - physical to virtual reverse mappings
3 *
4 * Copyright 2001, Rik van Riel <riel@conectiva.com.br>
5 * Released under the General Public License (GPL).
6 *
7 * Simple, low overhead reverse mapping scheme.
8 * Please try to keep this thing as modular as possible.
9 *
10 * Provides methods for unmapping each kind of mapped page:
11 * the anon methods track anonymous pages, and
12 * the file methods track pages belonging to an inode.
13 *
14 * Original design by Rik van Riel <riel@conectiva.com.br> 2001
15 * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004
16 * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004
17 * Contributions by Hugh Dickins <hugh@veritas.com> 2003, 2004
18 */
19
20/*
21 * Lock ordering in mm:
22 *
23 * inode->i_mutex	(while writing or truncating, not reading or faulting)
24 *   inode->i_alloc_sem (vmtruncate_range)
25 *   mm->mmap_sem
26 *     page->flags PG_locked (lock_page)
27 *       mapping->i_mmap_lock
28 *         anon_vma->lock
29 *           mm->page_table_lock or pte_lock
30 *             zone->lru_lock (in mark_page_accessed, isolate_lru_page)
31 *             swap_lock (in swap_duplicate, swap_info_get)
32 *               mmlist_lock (in mmput, drain_mmlist and others)
33 *               mapping->private_lock (in __set_page_dirty_buffers)
34 *               inode_lock (in set_page_dirty's __mark_inode_dirty)
35 *                 sb_lock (within inode_lock in fs/fs-writeback.c)
36 *                 mapping->tree_lock (widely used, in set_page_dirty,
37 *                           in arch-dependent flush_dcache_mmap_lock,
38 *                           within inode_lock in __sync_single_inode)
39 */
40
41#include <linux/mm.h>
42#include <linux/pagemap.h>
43#include <linux/swap.h>
44#include <linux/swapops.h>
45#include <linux/slab.h>
46#include <linux/init.h>
47#include <linux/rmap.h>
48#include <linux/rcupdate.h>
49#include <linux/module.h>
50#include <linux/kallsyms.h>
51#include <linux/memcontrol.h>
52#include <linux/mmu_notifier.h>
53#include <linux/migrate.h>
54
55#include <asm/tlbflush.h>
56
57#include "internal.h"
58
59static struct kmem_cache *anon_vma_cachep;
60
61static inline struct anon_vma *anon_vma_alloc(void)
62{
63	return kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
64}
65
66static inline void anon_vma_free(struct anon_vma *anon_vma)
67{
68	kmem_cache_free(anon_vma_cachep, anon_vma);
69}
70
71/**
72 * anon_vma_prepare - attach an anon_vma to a memory region
73 * @vma: the memory region in question
74 *
75 * This makes sure the memory mapping described by 'vma' has
76 * an 'anon_vma' attached to it, so that we can associate the
77 * anonymous pages mapped into it with that anon_vma.
78 *
79 * The common case will be that we already have one, but if
80 * if not we either need to find an adjacent mapping that we
81 * can re-use the anon_vma from (very common when the only
82 * reason for splitting a vma has been mprotect()), or we
83 * allocate a new one.
84 *
85 * Anon-vma allocations are very subtle, because we may have
86 * optimistically looked up an anon_vma in page_lock_anon_vma()
87 * and that may actually touch the spinlock even in the newly
88 * allocated vma (it depends on RCU to make sure that the
89 * anon_vma isn't actually destroyed).
90 *
91 * As a result, we need to do proper anon_vma locking even
92 * for the new allocation. At the same time, we do not want
93 * to do any locking for the common case of already having
94 * an anon_vma.
95 *
96 * This must be called with the mmap_sem held for reading.
97 */
98int anon_vma_prepare(struct vm_area_struct *vma)
99{
100	struct anon_vma *anon_vma = vma->anon_vma;
101
102	might_sleep();
103	if (unlikely(!anon_vma)) {
104		struct mm_struct *mm = vma->vm_mm;
105		struct anon_vma *allocated;
106
107		anon_vma = find_mergeable_anon_vma(vma);
108		allocated = NULL;
109		if (!anon_vma) {
110			anon_vma = anon_vma_alloc();
111			if (unlikely(!anon_vma))
112				return -ENOMEM;
113			allocated = anon_vma;
114		}
115		spin_lock(&anon_vma->lock);
116
117		/* page_table_lock to protect against threads */
118		spin_lock(&mm->page_table_lock);
119		if (likely(!vma->anon_vma)) {
120			vma->anon_vma = anon_vma;
121			list_add_tail(&vma->anon_vma_node, &anon_vma->head);
122			allocated = NULL;
123		}
124		spin_unlock(&mm->page_table_lock);
125
126		spin_unlock(&anon_vma->lock);
127		if (unlikely(allocated))
128			anon_vma_free(allocated);
129	}
130	return 0;
131}
132
133void __anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next)
134{
135	BUG_ON(vma->anon_vma != next->anon_vma);
136	list_del(&next->anon_vma_node);
137}
138
139void __anon_vma_link(struct vm_area_struct *vma)
140{
141	struct anon_vma *anon_vma = vma->anon_vma;
142
143	if (anon_vma)
144		list_add_tail(&vma->anon_vma_node, &anon_vma->head);
145}
146
147void anon_vma_link(struct vm_area_struct *vma)
148{
149	struct anon_vma *anon_vma = vma->anon_vma;
150
151	if (anon_vma) {
152		spin_lock(&anon_vma->lock);
153		list_add_tail(&vma->anon_vma_node, &anon_vma->head);
154		spin_unlock(&anon_vma->lock);
155	}
156}
157
158void anon_vma_unlink(struct vm_area_struct *vma)
159{
160	struct anon_vma *anon_vma = vma->anon_vma;
161	int empty;
162
163	if (!anon_vma)
164		return;
165
166	spin_lock(&anon_vma->lock);
167	list_del(&vma->anon_vma_node);
168
169	/* We must garbage collect the anon_vma if it's empty */
170	empty = list_empty(&anon_vma->head);
171	spin_unlock(&anon_vma->lock);
172
173	if (empty)
174		anon_vma_free(anon_vma);
175}
176
177static void anon_vma_ctor(void *data)
178{
179	struct anon_vma *anon_vma = data;
180
181	spin_lock_init(&anon_vma->lock);
182	INIT_LIST_HEAD(&anon_vma->head);
183}
184
185void __init anon_vma_init(void)
186{
187	anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
188			0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor);
189}
190
191/*
192 * Getting a lock on a stable anon_vma from a page off the LRU is
193 * tricky: page_lock_anon_vma rely on RCU to guard against the races.
194 */
195struct anon_vma *page_lock_anon_vma(struct page *page)
196{
197	struct anon_vma *anon_vma;
198	unsigned long anon_mapping;
199
200	rcu_read_lock();
201	anon_mapping = (unsigned long) page->mapping;
202	if (!(anon_mapping & PAGE_MAPPING_ANON))
203		goto out;
204	if (!page_mapped(page))
205		goto out;
206
207	anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
208	spin_lock(&anon_vma->lock);
209	return anon_vma;
210out:
211	rcu_read_unlock();
212	return NULL;
213}
214
215void page_unlock_anon_vma(struct anon_vma *anon_vma)
216{
217	spin_unlock(&anon_vma->lock);
218	rcu_read_unlock();
219}
220
221/*
222 * At what user virtual address is page expected in @vma?
223 * Returns virtual address or -EFAULT if page's index/offset is not
224 * within the range mapped the @vma.
225 */
226static inline unsigned long
227vma_address(struct page *page, struct vm_area_struct *vma)
228{
229	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
230	unsigned long address;
231
232	address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
233	if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
234		/* page should be within @vma mapping range */
235		return -EFAULT;
236	}
237	return address;
238}
239
240/*
241 * At what user virtual address is page expected in vma? checking that the
242 * page matches the vma: currently only used on anon pages, by unuse_vma;
243 */
244unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
245{
246	if (PageAnon(page)) {
247		if ((void *)vma->anon_vma !=
248		    (void *)page->mapping - PAGE_MAPPING_ANON)
249			return -EFAULT;
250	} else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) {
251		if (!vma->vm_file ||
252		    vma->vm_file->f_mapping != page->mapping)
253			return -EFAULT;
254	} else
255		return -EFAULT;
256	return vma_address(page, vma);
257}
258
259/*
260 * Check that @page is mapped at @address into @mm.
261 *
262 * If @sync is false, page_check_address may perform a racy check to avoid
263 * the page table lock when the pte is not present (helpful when reclaiming
264 * highly shared pages).
265 *
266 * On success returns with pte mapped and locked.
267 */
268pte_t *page_check_address(struct page *page, struct mm_struct *mm,
269			  unsigned long address, spinlock_t **ptlp, int sync)
270{
271	pgd_t *pgd;
272	pud_t *pud;
273	pmd_t *pmd;
274	pte_t *pte;
275	spinlock_t *ptl;
276
277	pgd = pgd_offset(mm, address);
278	if (!pgd_present(*pgd))
279		return NULL;
280
281	pud = pud_offset(pgd, address);
282	if (!pud_present(*pud))
283		return NULL;
284
285	pmd = pmd_offset(pud, address);
286	if (!pmd_present(*pmd))
287		return NULL;
288
289	pte = pte_offset_map(pmd, address);
290	/* Make a quick check before getting the lock */
291	if (!sync && !pte_present(*pte)) {
292		pte_unmap(pte);
293		return NULL;
294	}
295
296	ptl = pte_lockptr(mm, pmd);
297	spin_lock(ptl);
298	if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) {
299		*ptlp = ptl;
300		return pte;
301	}
302	pte_unmap_unlock(pte, ptl);
303	return NULL;
304}
305
306/**
307 * page_mapped_in_vma - check whether a page is really mapped in a VMA
308 * @page: the page to test
309 * @vma: the VMA to test
310 *
311 * Returns 1 if the page is mapped into the page tables of the VMA, 0
312 * if the page is not mapped into the page tables of this VMA.  Only
313 * valid for normal file or anonymous VMAs.
314 */
315static int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
316{
317	unsigned long address;
318	pte_t *pte;
319	spinlock_t *ptl;
320
321	address = vma_address(page, vma);
322	if (address == -EFAULT)		/* out of vma range */
323		return 0;
324	pte = page_check_address(page, vma->vm_mm, address, &ptl, 1);
325	if (!pte)			/* the page is not in this mm */
326		return 0;
327	pte_unmap_unlock(pte, ptl);
328
329	return 1;
330}
331
332/*
333 * Subfunctions of page_referenced: page_referenced_one called
334 * repeatedly from either page_referenced_anon or page_referenced_file.
335 */
336static int page_referenced_one(struct page *page,
337	struct vm_area_struct *vma, unsigned int *mapcount)
338{
339	struct mm_struct *mm = vma->vm_mm;
340	unsigned long address;
341	pte_t *pte;
342	spinlock_t *ptl;
343	int referenced = 0;
344
345	address = vma_address(page, vma);
346	if (address == -EFAULT)
347		goto out;
348
349	pte = page_check_address(page, mm, address, &ptl, 0);
350	if (!pte)
351		goto out;
352
353	/*
354	 * Don't want to elevate referenced for mlocked page that gets this far,
355	 * in order that it progresses to try_to_unmap and is moved to the
356	 * unevictable list.
357	 */
358	if (vma->vm_flags & VM_LOCKED) {
359		*mapcount = 1;	/* break early from loop */
360		goto out_unmap;
361	}
362
363	if (ptep_clear_flush_young_notify(vma, address, pte))
364		referenced++;
365
366	/* Pretend the page is referenced if the task has the
367	   swap token and is in the middle of a page fault. */
368	if (mm != current->mm && has_swap_token(mm) &&
369			rwsem_is_locked(&mm->mmap_sem))
370		referenced++;
371
372out_unmap:
373	(*mapcount)--;
374	pte_unmap_unlock(pte, ptl);
375out:
376	return referenced;
377}
378
379static int page_referenced_anon(struct page *page,
380				struct mem_cgroup *mem_cont)
381{
382	unsigned int mapcount;
383	struct anon_vma *anon_vma;
384	struct vm_area_struct *vma;
385	int referenced = 0;
386
387	anon_vma = page_lock_anon_vma(page);
388	if (!anon_vma)
389		return referenced;
390
391	mapcount = page_mapcount(page);
392	list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
393		/*
394		 * If we are reclaiming on behalf of a cgroup, skip
395		 * counting on behalf of references from different
396		 * cgroups
397		 */
398		if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont))
399			continue;
400		referenced += page_referenced_one(page, vma, &mapcount);
401		if (!mapcount)
402			break;
403	}
404
405	page_unlock_anon_vma(anon_vma);
406	return referenced;
407}
408
409/**
410 * page_referenced_file - referenced check for object-based rmap
411 * @page: the page we're checking references on.
412 * @mem_cont: target memory controller
413 *
414 * For an object-based mapped page, find all the places it is mapped and
415 * check/clear the referenced flag.  This is done by following the page->mapping
416 * pointer, then walking the chain of vmas it holds.  It returns the number
417 * of references it found.
418 *
419 * This function is only called from page_referenced for object-based pages.
420 */
421static int page_referenced_file(struct page *page,
422				struct mem_cgroup *mem_cont)
423{
424	unsigned int mapcount;
425	struct address_space *mapping = page->mapping;
426	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
427	struct vm_area_struct *vma;
428	struct prio_tree_iter iter;
429	int referenced = 0;
430
431	/*
432	 * The caller's checks on page->mapping and !PageAnon have made
433	 * sure that this is a file page: the check for page->mapping
434	 * excludes the case just before it gets set on an anon page.
435	 */
436	BUG_ON(PageAnon(page));
437
438	/*
439	 * The page lock not only makes sure that page->mapping cannot
440	 * suddenly be NULLified by truncation, it makes sure that the
441	 * structure at mapping cannot be freed and reused yet,
442	 * so we can safely take mapping->i_mmap_lock.
443	 */
444	BUG_ON(!PageLocked(page));
445
446	spin_lock(&mapping->i_mmap_lock);
447
448	/*
449	 * i_mmap_lock does not stabilize mapcount at all, but mapcount
450	 * is more likely to be accurate if we note it after spinning.
451	 */
452	mapcount = page_mapcount(page);
453
454	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
455		/*
456		 * If we are reclaiming on behalf of a cgroup, skip
457		 * counting on behalf of references from different
458		 * cgroups
459		 */
460		if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont))
461			continue;
462		referenced += page_referenced_one(page, vma, &mapcount);
463		if (!mapcount)
464			break;
465	}
466
467	spin_unlock(&mapping->i_mmap_lock);
468	return referenced;
469}
470
471/**
472 * page_referenced - test if the page was referenced
473 * @page: the page to test
474 * @is_locked: caller holds lock on the page
475 * @mem_cont: target memory controller
476 *
477 * Quick test_and_clear_referenced for all mappings to a page,
478 * returns the number of ptes which referenced the page.
479 */
480int page_referenced(struct page *page, int is_locked,
481			struct mem_cgroup *mem_cont)
482{
483	int referenced = 0;
484
485	if (TestClearPageReferenced(page))
486		referenced++;
487
488	if (page_mapped(page) && page->mapping) {
489		if (PageAnon(page))
490			referenced += page_referenced_anon(page, mem_cont);
491		else if (is_locked)
492			referenced += page_referenced_file(page, mem_cont);
493		else if (!trylock_page(page))
494			referenced++;
495		else {
496			if (page->mapping)
497				referenced +=
498					page_referenced_file(page, mem_cont);
499			unlock_page(page);
500		}
501	}
502
503	if (page_test_and_clear_young(page))
504		referenced++;
505
506	return referenced;
507}
508
509static int page_mkclean_one(struct page *page, struct vm_area_struct *vma)
510{
511	struct mm_struct *mm = vma->vm_mm;
512	unsigned long address;
513	pte_t *pte;
514	spinlock_t *ptl;
515	int ret = 0;
516
517	address = vma_address(page, vma);
518	if (address == -EFAULT)
519		goto out;
520
521	pte = page_check_address(page, mm, address, &ptl, 1);
522	if (!pte)
523		goto out;
524
525	if (pte_dirty(*pte) || pte_write(*pte)) {
526		pte_t entry;
527
528		flush_cache_page(vma, address, pte_pfn(*pte));
529		entry = ptep_clear_flush_notify(vma, address, pte);
530		entry = pte_wrprotect(entry);
531		entry = pte_mkclean(entry);
532		set_pte_at(mm, address, pte, entry);
533		ret = 1;
534	}
535
536	pte_unmap_unlock(pte, ptl);
537out:
538	return ret;
539}
540
541static int page_mkclean_file(struct address_space *mapping, struct page *page)
542{
543	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
544	struct vm_area_struct *vma;
545	struct prio_tree_iter iter;
546	int ret = 0;
547
548	BUG_ON(PageAnon(page));
549
550	spin_lock(&mapping->i_mmap_lock);
551	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
552		if (vma->vm_flags & VM_SHARED)
553			ret += page_mkclean_one(page, vma);
554	}
555	spin_unlock(&mapping->i_mmap_lock);
556	return ret;
557}
558
/*
 * Clean and write-protect the shared file mappings of a locked page.
 * Returns 0 when the page is unmapped or has no mapping.  Otherwise
 * returns the count from page_mkclean_file(), or 1 when
 * page_test_dirty() reported the page dirty (that state is cleared).
 */
int page_mkclean(struct page *page)
{
	struct address_space *mapping;
	int cleaned;

	BUG_ON(!PageLocked(page));

	if (!page_mapped(page))
		return 0;

	mapping = page_mapping(page);
	if (!mapping)
		return 0;

	cleaned = page_mkclean_file(mapping, page);
	if (page_test_dirty(page)) {
		page_clear_dirty(page);
		cleaned = 1;
	}

	return cleaned;
}
EXPORT_SYMBOL_GPL(page_mkclean);
579
580/**
581 * __page_set_anon_rmap - setup new anonymous rmap
582 * @page:	the page to add the mapping to
583 * @vma:	the vm area in which the mapping is added
584 * @address:	the user virtual address mapped
585 */
586static void __page_set_anon_rmap(struct page *page,
587	struct vm_area_struct *vma, unsigned long address)
588{
589	struct anon_vma *anon_vma = vma->anon_vma;
590
591	BUG_ON(!anon_vma);
592	anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
593	page->mapping = (struct address_space *) anon_vma;
594
595	page->index = linear_page_index(vma, address);
596
597	/*
598	 * nr_mapped state can be updated without turning off
599	 * interrupts because it is not modified via interrupt.
600	 */
601	__inc_zone_page_state(page, NR_ANON_PAGES);
602}
603
/**
 * __page_check_anon_rmap - sanity check anonymous rmap addition
 * @page:	the page to add the mapping to
 * @vma:	the vm area in which the mapping is added
 * @address:	the user virtual address mapped
 *
 * Compiles to nothing unless CONFIG_DEBUG_VM is set.
 */
static void __page_check_anon_rmap(struct page *page,
	struct vm_area_struct *vma, unsigned long address)
{
#ifdef CONFIG_DEBUG_VM
	/*
	 * The page's anon-rmap details (mapping and index) are guaranteed to
	 * be set up correctly at this point.
	 *
	 * We have exclusion against page_add_anon_rmap because the caller
	 * always holds the page locked, except if called from page_dup_rmap,
	 * in which case the page is already known to be setup.
	 *
	 * We have exclusion against page_add_new_anon_rmap because those pages
	 * are initially only visible via the pagetables, and the pte is locked
	 * over the call to page_add_new_anon_rmap.
	 */
	void *expected = (void *) vma->anon_vma + PAGE_MAPPING_ANON;

	BUG_ON(page->mapping != (struct address_space *) expected);
	BUG_ON(page->index != linear_page_index(vma, address));
#endif
}
632
633/**
634 * page_add_anon_rmap - add pte mapping to an anonymous page
635 * @page:	the page to add the mapping to
636 * @vma:	the vm area in which the mapping is added
637 * @address:	the user virtual address mapped
638 *
639 * The caller needs to hold the pte lock and the page must be locked.
640 */
641void page_add_anon_rmap(struct page *page,
642	struct vm_area_struct *vma, unsigned long address)
643{
644	VM_BUG_ON(!PageLocked(page));
645	VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
646	if (atomic_inc_and_test(&page->_mapcount))
647		__page_set_anon_rmap(page, vma, address);
648	else
649		__page_check_anon_rmap(page, vma, address);
650}
651
652/**
653 * page_add_new_anon_rmap - add pte mapping to a new anonymous page
654 * @page:	the page to add the mapping to
655 * @vma:	the vm area in which the mapping is added
656 * @address:	the user virtual address mapped
657 *
658 * Same as page_add_anon_rmap but must only be called on *new* pages.
659 * This means the inc-and-test can be bypassed.
660 * Page does not have to be locked.
661 */
662void page_add_new_anon_rmap(struct page *page,
663	struct vm_area_struct *vma, unsigned long address)
664{
665	BUG_ON(address < vma->vm_start || address >= vma->vm_end);
666	atomic_set(&page->_mapcount, 0); /* elevate count by 1 (starts at -1) */
667	__page_set_anon_rmap(page, vma, address);
668}
669
670/**
671 * page_add_file_rmap - add pte mapping to a file page
672 * @page: the page to add the mapping to
673 *
674 * The caller needs to hold the pte lock.
675 */
676void page_add_file_rmap(struct page *page)
677{
678	if (atomic_inc_and_test(&page->_mapcount))
679		__inc_zone_page_state(page, NR_FILE_MAPPED);
680}
681
#ifdef CONFIG_DEBUG_VM
/**
 * page_dup_rmap - duplicate pte mapping to a page
 * @page:	the page to add the mapping to
 * @vma:	the vm area being duplicated
 * @address:	the user virtual address mapped
 *
 * For copy_page_range only: minimal extract from page_add_file_rmap /
 * page_add_anon_rmap, avoiding unnecessary tests (already checked) so it's
 * quicker.  This out-of-line version is only built with CONFIG_DEBUG_VM.
 *
 * The caller needs to hold the pte lock.
 */
void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address)
{
	/* A page being duplicated must already be mapped somewhere. */
	BUG_ON(!page_mapcount(page));

	if (PageAnon(page))
		__page_check_anon_rmap(page, vma, address);

	atomic_inc(&page->_mapcount);
}
#endif
703
704/**
705 * page_remove_rmap - take down pte mapping from a page
706 * @page: page to remove mapping from
707 * @vma: the vm area in which the mapping is removed
708 *
709 * The caller needs to hold the pte lock.
710 */
711void page_remove_rmap(struct page *page, struct vm_area_struct *vma)
712{
713	if (atomic_add_negative(-1, &page->_mapcount)) {
714		if (unlikely(page_mapcount(page) < 0)) {
715			printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page));
716			printk (KERN_EMERG "  page pfn = %lx\n", page_to_pfn(page));
717			printk (KERN_EMERG "  page->flags = %lx\n", page->flags);
718			printk (KERN_EMERG "  page->count = %x\n", page_count(page));
719			printk (KERN_EMERG "  page->mapping = %p\n", page->mapping);
720			print_symbol (KERN_EMERG "  vma->vm_ops = %s\n", (unsigned long)vma->vm_ops);
721			if (vma->vm_ops) {
722				print_symbol (KERN_EMERG "  vma->vm_ops->fault = %s\n", (unsigned long)vma->vm_ops->fault);
723			}
724			if (vma->vm_file && vma->vm_file->f_op)
725				print_symbol (KERN_EMERG "  vma->vm_file->f_op->mmap = %s\n", (unsigned long)vma->vm_file->f_op->mmap);
726			BUG();
727		}
728
729		/*
730		 * Now that the last pte has gone, s390 must transfer dirty
731		 * flag from storage key to struct page.  We can usually skip
732		 * this if the page is anon, so about to be freed; but perhaps
733		 * not if it's in swapcache - there might be another pte slot
734		 * containing the swap entry, but page not yet written to swap.
735		 */
736		if ((!PageAnon(page) || PageSwapCache(page)) &&
737		    page_test_dirty(page)) {
738			page_clear_dirty(page);
739			set_page_dirty(page);
740		}
741		if (PageAnon(page))
742			mem_cgroup_uncharge_page(page);
743		__dec_zone_page_state(page,
744			PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED);
745		/*
746		 * It would be tidy to reset the PageAnon mapping here,
747		 * but that might overwrite a racing page_add_anon_rmap
748		 * which increments mapcount after us but sets mapping
749		 * before us: so leave the reset to free_hot_cold_page,
750		 * and remember that it's only reliable while mapped.
751		 * Leaving it set also helps swapoff to reinstate ptes
752		 * faster for those pages still in swapcache.
753		 */
754	}
755}
756
757/*
758 * Subfunctions of try_to_unmap: try_to_unmap_one called
759 * repeatedly from either try_to_unmap_anon or try_to_unmap_file.
760 */
761static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
762				int migration)
763{
764	struct mm_struct *mm = vma->vm_mm;
765	unsigned long address;
766	pte_t *pte;
767	pte_t pteval;
768	spinlock_t *ptl;
769	int ret = SWAP_AGAIN;
770
771	address = vma_address(page, vma);
772	if (address == -EFAULT)
773		goto out;
774
775	pte = page_check_address(page, mm, address, &ptl, 0);
776	if (!pte)
777		goto out;
778
779	/*
780	 * If the page is mlock()d, we cannot swap it out.
781	 * If it's recently referenced (perhaps page_referenced
782	 * skipped over this mm) then we should reactivate it.
783	 */
784	if (!migration) {
785		if (vma->vm_flags & VM_LOCKED) {
786			ret = SWAP_MLOCK;
787			goto out_unmap;
788		}
789		if (ptep_clear_flush_young_notify(vma, address, pte)) {
790			ret = SWAP_FAIL;
791			goto out_unmap;
792		}
793  	}
794
795	/* Nuke the page table entry. */
796	flush_cache_page(vma, address, page_to_pfn(page));
797	pteval = ptep_clear_flush_notify(vma, address, pte);
798
799	/* Move the dirty bit to the physical page now the pte is gone. */
800	if (pte_dirty(pteval))
801		set_page_dirty(page);
802
803	/* Update high watermark before we lower rss */
804	update_hiwater_rss(mm);
805
806	if (PageAnon(page)) {
807		swp_entry_t entry = { .val = page_private(page) };
808
809		if (PageSwapCache(page)) {
810			/*
811			 * Store the swap location in the pte.
812			 * See handle_pte_fault() ...
813			 */
814			swap_duplicate(entry);
815			if (list_empty(&mm->mmlist)) {
816				spin_lock(&mmlist_lock);
817				if (list_empty(&mm->mmlist))
818					list_add(&mm->mmlist, &init_mm.mmlist);
819				spin_unlock(&mmlist_lock);
820			}
821			dec_mm_counter(mm, anon_rss);
822		} else if (PAGE_MIGRATION) {
823			/*
824			 * Store the pfn of the page in a special migration
825			 * pte. do_swap_page() will wait until the migration
826			 * pte is removed and then restart fault handling.
827			 */
828			BUG_ON(!migration);
829			entry = make_migration_entry(page, pte_write(pteval));
830		}
831		set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
832		BUG_ON(pte_file(*pte));
833	} else if (PAGE_MIGRATION && migration) {
834		/* Establish migration entry for a file page */
835		swp_entry_t entry;
836		entry = make_migration_entry(page, pte_write(pteval));
837		set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
838	} else
839		dec_mm_counter(mm, file_rss);
840
841
842	page_remove_rmap(page, vma);
843	page_cache_release(page);
844
845out_unmap:
846	pte_unmap_unlock(pte, ptl);
847out:
848	return ret;
849}
850
851/*
852 * objrmap doesn't work for nonlinear VMAs because the assumption that
853 * offset-into-file correlates with offset-into-virtual-addresses does not hold.
854 * Consequently, given a particular page and its ->index, we cannot locate the
855 * ptes which are mapping that page without an exhaustive linear search.
856 *
857 * So what this code does is a mini "virtual scan" of each nonlinear VMA which
858 * maps the file to which the target page belongs.  The ->vm_private_data field
859 * holds the current cursor into that scan.  Successive searches will circulate
860 * around the vma's virtual address space.
861 *
862 * So as more replacement pressure is applied to the pages in a nonlinear VMA,
863 * more scanning pressure is placed against them as well.   Eventually pages
864 * will become fully unmapped and are eligible for eviction.
865 *
866 * For very sparsely populated VMAs this is a little inefficient - chances are
 * there won't be many ptes located within the scan cluster.  In this case
868 * maybe we could scan further - to the end of the pte page, perhaps.
869 *
870 * Mlocked pages:  check VM_LOCKED under mmap_sem held for read, if we can
871 * acquire it without blocking.  If vma locked, mlock the pages in the cluster,
872 * rather than unmapping them.  If we encounter the "check_page" that vmscan is
873 * trying to unmap, return SWAP_MLOCK, else default SWAP_AGAIN.
874 */
/* Bytes covered by one scan cluster: 32 pages, capped at PMD_SIZE. */
#define CLUSTER_SIZE	min(32 * PAGE_SIZE, PMD_SIZE)
/* ANDed with an address in try_to_unmap_cluster() to find the cluster start. */
#define CLUSTER_MASK	(~(CLUSTER_SIZE - 1))
877
878static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
879		struct vm_area_struct *vma, struct page *check_page)
880{
881	struct mm_struct *mm = vma->vm_mm;
882	pgd_t *pgd;
883	pud_t *pud;
884	pmd_t *pmd;
885	pte_t *pte;
886	pte_t pteval;
887	spinlock_t *ptl;
888	struct page *page;
889	unsigned long address;
890	unsigned long end;
891	int ret = SWAP_AGAIN;
892	int locked_vma = 0;
893
894	address = (vma->vm_start + cursor) & CLUSTER_MASK;
895	end = address + CLUSTER_SIZE;
896	if (address < vma->vm_start)
897		address = vma->vm_start;
898	if (end > vma->vm_end)
899		end = vma->vm_end;
900
901	pgd = pgd_offset(mm, address);
902	if (!pgd_present(*pgd))
903		return ret;
904
905	pud = pud_offset(pgd, address);
906	if (!pud_present(*pud))
907		return ret;
908
909	pmd = pmd_offset(pud, address);
910	if (!pmd_present(*pmd))
911		return ret;
912
913	/*
914	 * MLOCK_PAGES => feature is configured.
915	 * if we can acquire the mmap_sem for read, and vma is VM_LOCKED,
916	 * keep the sem while scanning the cluster for mlocking pages.
917	 */
918	if (MLOCK_PAGES && down_read_trylock(&vma->vm_mm->mmap_sem)) {
919		locked_vma = (vma->vm_flags & VM_LOCKED);
920		if (!locked_vma)
921			up_read(&vma->vm_mm->mmap_sem); /* don't need it */
922	}
923
924	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
925
926	/* Update high watermark before we lower rss */
927	update_hiwater_rss(mm);
928
929	for (; address < end; pte++, address += PAGE_SIZE) {
930		if (!pte_present(*pte))
931			continue;
932		page = vm_normal_page(vma, address, *pte);
933		BUG_ON(!page || PageAnon(page));
934
935		if (locked_vma) {
936			mlock_vma_page(page);   /* no-op if already mlocked */
937			if (page == check_page)
938				ret = SWAP_MLOCK;
939			continue;	/* don't unmap */
940		}
941
942		if (ptep_clear_flush_young_notify(vma, address, pte))
943			continue;
944
945		/* Nuke the page table entry. */
946		flush_cache_page(vma, address, pte_pfn(*pte));
947		pteval = ptep_clear_flush_notify(vma, address, pte);
948
949		/* If nonlinear, store the file page offset in the pte. */
950		if (page->index != linear_page_index(vma, address))
951			set_pte_at(mm, address, pte, pgoff_to_pte(page->index));
952
953		/* Move the dirty bit to the physical page now the pte is gone. */
954		if (pte_dirty(pteval))
955			set_page_dirty(page);
956
957		page_remove_rmap(page, vma);
958		page_cache_release(page);
959		dec_mm_counter(mm, file_rss);
960		(*mapcount)--;
961	}
962	pte_unmap_unlock(pte - 1, ptl);
963	if (locked_vma)
964		up_read(&vma->vm_mm->mmap_sem);
965	return ret;
966}
967
968/*
969 * common handling for pages mapped in VM_LOCKED vmas
970 */
971static int try_to_mlock_page(struct page *page, struct vm_area_struct *vma)
972{
973	int mlocked = 0;
974
975	if (down_read_trylock(&vma->vm_mm->mmap_sem)) {
976		if (vma->vm_flags & VM_LOCKED) {
977			mlock_vma_page(page);
978			mlocked++;	/* really mlocked the page */
979		}
980		up_read(&vma->vm_mm->mmap_sem);
981	}
982	return mlocked;
983}
984
985/**
986 * try_to_unmap_anon - unmap or unlock anonymous page using the object-based
987 * rmap method
988 * @page: the page to unmap/unlock
989 * @unlock:  request for unlock rather than unmap [unlikely]
990 * @migration:  unmapping for migration - ignored if @unlock
991 *
992 * Find all the mappings of a page using the mapping pointer and the vma chains
993 * contained in the anon_vma struct it points to.
994 *
995 * This function is only called from try_to_unmap/try_to_munlock for
996 * anonymous pages.
997 * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
998 * where the page was found will be held for write.  So, we won't recheck
999 * vm_flags for that VMA.  That should be OK, because that vma shouldn't be
1000 * 'LOCKED.
1001 */
1002static int try_to_unmap_anon(struct page *page, int unlock, int migration)
1003{
1004	struct anon_vma *anon_vma;
1005	struct vm_area_struct *vma;
1006	unsigned int mlocked = 0;
1007	int ret = SWAP_AGAIN;
1008
1009	if (MLOCK_PAGES && unlikely(unlock))
1010		ret = SWAP_SUCCESS;	/* default for try_to_munlock() */
1011
1012	anon_vma = page_lock_anon_vma(page);
1013	if (!anon_vma)
1014		return ret;
1015
1016	list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
1017		if (MLOCK_PAGES && unlikely(unlock)) {
1018			if (!((vma->vm_flags & VM_LOCKED) &&
1019			      page_mapped_in_vma(page, vma)))
1020				continue;  /* must visit all unlocked vmas */
1021			ret = SWAP_MLOCK;  /* saw at least one mlocked vma */
1022		} else {
1023			ret = try_to_unmap_one(page, vma, migration);
1024			if (ret == SWAP_FAIL || !page_mapped(page))
1025				break;
1026		}
1027		if (ret == SWAP_MLOCK) {
1028			mlocked = try_to_mlock_page(page, vma);
1029			if (mlocked)
1030				break;	/* stop if actually mlocked page */
1031		}
1032	}
1033
1034	page_unlock_anon_vma(anon_vma);
1035
1036	if (mlocked)
1037		ret = SWAP_MLOCK;	/* actually mlocked the page */
1038	else if (ret == SWAP_MLOCK)
1039		ret = SWAP_AGAIN;	/* saw VM_LOCKED vma */
1040
1041	return ret;
1042}
1043
1044/**
1045 * try_to_unmap_file - unmap/unlock file page using the object-based rmap method
1046 * @page: the page to unmap/unlock
1047 * @unlock:  request for unlock rather than unmap [unlikely]
1048 * @migration:  unmapping for migration - ignored if @unlock
1049 *
1050 * Find all the mappings of a page using the mapping pointer and the vma chains
1051 * contained in the address_space struct it points to.
1052 *
1053 * This function is only called from try_to_unmap/try_to_munlock for
1054 * object-based pages.
1055 * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
1056 * where the page was found will be held for write.  So, we won't recheck
1057 * vm_flags for that VMA.  That should be OK, because that vma shouldn't be
1058 * 'LOCKED.
1059 */
1060static int try_to_unmap_file(struct page *page, int unlock, int migration)
1061{
1062	struct address_space *mapping = page->mapping;
1063	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
1064	struct vm_area_struct *vma;
1065	struct prio_tree_iter iter;
1066	int ret = SWAP_AGAIN;
1067	unsigned long cursor;
1068	unsigned long max_nl_cursor = 0;
1069	unsigned long max_nl_size = 0;
1070	unsigned int mapcount;
1071	unsigned int mlocked = 0;
1072
1073	if (MLOCK_PAGES && unlikely(unlock))
1074		ret = SWAP_SUCCESS;	/* default for try_to_munlock() */
1075
1076	spin_lock(&mapping->i_mmap_lock);
1077	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
1078		if (MLOCK_PAGES && unlikely(unlock)) {
1079			if (!(vma->vm_flags & VM_LOCKED))
1080				continue;	/* must visit all vmas */
1081			ret = SWAP_MLOCK;
1082		} else {
1083			ret = try_to_unmap_one(page, vma, migration);
1084			if (ret == SWAP_FAIL || !page_mapped(page))
1085				goto out;
1086		}
1087		if (ret == SWAP_MLOCK) {
1088			mlocked = try_to_mlock_page(page, vma);
1089			if (mlocked)
1090				break;  /* stop if actually mlocked page */
1091		}
1092	}
1093
1094	if (mlocked)
1095		goto out;
1096
1097	if (list_empty(&mapping->i_mmap_nonlinear))
1098		goto out;
1099
1100	list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
1101						shared.vm_set.list) {
1102		if (MLOCK_PAGES && unlikely(unlock)) {
1103			if (!(vma->vm_flags & VM_LOCKED))
1104				continue;	/* must visit all vmas */
1105			ret = SWAP_MLOCK;	/* leave mlocked == 0 */
1106			goto out;		/* no need to look further */
1107		}
1108		if (!MLOCK_PAGES && !migration && (vma->vm_flags & VM_LOCKED))
1109			continue;
1110		cursor = (unsigned long) vma->vm_private_data;
1111		if (cursor > max_nl_cursor)
1112			max_nl_cursor = cursor;
1113		cursor = vma->vm_end - vma->vm_start;
1114		if (cursor > max_nl_size)
1115			max_nl_size = cursor;
1116	}
1117
1118	if (max_nl_size == 0) {	/* all nonlinears locked or reserved ? */
1119		ret = SWAP_FAIL;
1120		goto out;
1121	}
1122
1123	/*
1124	 * We don't try to search for this page in the nonlinear vmas,
1125	 * and page_referenced wouldn't have found it anyway.  Instead
1126	 * just walk the nonlinear vmas trying to age and unmap some.
1127	 * The mapcount of the page we came in with is irrelevant,
1128	 * but even so use it as a guide to how hard we should try?
1129	 */
1130	mapcount = page_mapcount(page);
1131	if (!mapcount)
1132		goto out;
1133	cond_resched_lock(&mapping->i_mmap_lock);
1134
1135	max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK;
1136	if (max_nl_cursor == 0)
1137		max_nl_cursor = CLUSTER_SIZE;
1138
1139	do {
1140		list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
1141						shared.vm_set.list) {
1142			if (!MLOCK_PAGES && !migration &&
1143			    (vma->vm_flags & VM_LOCKED))
1144				continue;
1145			cursor = (unsigned long) vma->vm_private_data;
1146			while ( cursor < max_nl_cursor &&
1147				cursor < vma->vm_end - vma->vm_start) {
1148				ret = try_to_unmap_cluster(cursor, &mapcount,
1149								vma, page);
1150				if (ret == SWAP_MLOCK)
1151					mlocked = 2;	/* to return below */
1152				cursor += CLUSTER_SIZE;
1153				vma->vm_private_data = (void *) cursor;
1154				if ((int)mapcount <= 0)
1155					goto out;
1156			}
1157			vma->vm_private_data = (void *) max_nl_cursor;
1158		}
1159		cond_resched_lock(&mapping->i_mmap_lock);
1160		max_nl_cursor += CLUSTER_SIZE;
1161	} while (max_nl_cursor <= max_nl_size);
1162
1163	/*
1164	 * Don't loop forever (perhaps all the remaining pages are
1165	 * in locked vmas).  Reset cursor on all unreserved nonlinear
1166	 * vmas, now forgetting on which ones it had fallen behind.
1167	 */
1168	list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
1169		vma->vm_private_data = NULL;
1170out:
1171	spin_unlock(&mapping->i_mmap_lock);
1172	if (mlocked)
1173		ret = SWAP_MLOCK;	/* actually mlocked the page */
1174	else if (ret == SWAP_MLOCK)
1175		ret = SWAP_AGAIN;	/* saw VM_LOCKED vma */
1176	return ret;
1177}
1178
1179/**
1180 * try_to_unmap - try to remove all page table mappings to a page
1181 * @page: the page to get unmapped
1182 * @migration: migration flag
1183 *
1184 * Tries to remove all the page table entries which are mapping this
1185 * page, used in the pageout path.  Caller must hold the page lock.
1186 * Return values are:
1187 *
1188 * SWAP_SUCCESS	- we succeeded in removing all mappings
1189 * SWAP_AGAIN	- we missed a mapping, try again later
1190 * SWAP_FAIL	- the page is unswappable
1191 * SWAP_MLOCK	- page is mlocked.
1192 */
1193int try_to_unmap(struct page *page, int migration)
1194{
1195	int ret;
1196
1197	BUG_ON(!PageLocked(page));
1198
1199	if (PageAnon(page))
1200		ret = try_to_unmap_anon(page, 0, migration);
1201	else
1202		ret = try_to_unmap_file(page, 0, migration);
1203	if (ret != SWAP_MLOCK && !page_mapped(page))
1204		ret = SWAP_SUCCESS;
1205	return ret;
1206}
1207
#ifdef CONFIG_UNEVICTABLE_LRU
/**
 * try_to_munlock - try to munlock a page
 * @page: the page to be munlocked
 *
 * Called from munlock code.  Checks all of the VMAs mapping the page
 * to make sure nobody else has this page mlocked. The page will be
 * returned with PG_mlocked cleared if no other vmas have it mlocked.
 *
 * The page must be locked and must not be on the LRU.  The work is done
 * by the try_to_unmap_anon()/try_to_unmap_file() walkers, run in their
 * unlock mode.
 *
 * Return values are:
 *
 * SWAP_SUCCESS	- no vma's holding page mlocked.
 * SWAP_AGAIN	- page mapped in mlocked vma -- couldn't acquire mmap sem
 * SWAP_MLOCK	- page is now mlocked.
 */
int try_to_munlock(struct page *page)
{
	VM_BUG_ON(!PageLocked(page) || PageLRU(page));

	if (!PageAnon(page))
		return try_to_unmap_file(page, 1, 0);

	return try_to_unmap_anon(page, 1, 0);
}
#endif
1233