swap_state.c revision bd53b714d32a29bdf33009f812e295667e92b930
/*
 *  linux/mm/swap_state.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 *
 *  Rewritten to use page cache, (C) 1998 Stephen Tweedie
 */
9#include <linux/module.h>
10#include <linux/mm.h>
11#include <linux/kernel_stat.h>
12#include <linux/swap.h>
13#include <linux/init.h>
14#include <linux/pagemap.h>
15#include <linux/buffer_head.h>
16#include <linux/backing-dev.h>
17
18#include <asm/pgtable.h>
19
20/*
21 * swapper_space is a fiction, retained to simplify the path through
22 * vmscan's shrink_list, to make sync_page look nicer, and to allow
23 * future use of radix_tree tags in the swap cache.
24 */
25static struct address_space_operations swap_aops = {
26	.writepage	= swap_writepage,
27	.sync_page	= block_sync_page,
28	.set_page_dirty	= __set_page_dirty_nobuffers,
29};
30
31static struct backing_dev_info swap_backing_dev_info = {
32	.capabilities	= BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
33	.unplug_io_fn	= swap_unplug_io_fn,
34};
35
36struct address_space swapper_space = {
37	.page_tree	= RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN),
38	.tree_lock	= RW_LOCK_UNLOCKED,
39	.a_ops		= &swap_aops,
40	.i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear),
41	.backing_dev_info = &swap_backing_dev_info,
42};
43EXPORT_SYMBOL(swapper_space);
44
/*
 * Swap cache event counters, reported by show_swap_cache_info().
 * They are bumped through INC_CACHE_INFO() with a plain increment.
 */
static struct {
	unsigned long add_total;	/* pages successfully added */
	unsigned long del_total;	/* pages deleted */
	unsigned long find_success;	/* lookup_swap_cache() hits */
	unsigned long find_total;	/* lookup_swap_cache() calls */
	unsigned long noent_race;	/* swap_duplicate() failed on add */
	unsigned long exist_race;	/* add failed with -EEXIST */
} swap_cache_info;

/* Increment the swap_cache_info counter named by @x. */
#define INC_CACHE_INFO(x)	do { swap_cache_info.x++; } while (0)
55
56void show_swap_cache_info(void)
57{
58	printk("Swap cache: add %lu, delete %lu, find %lu/%lu, race %lu+%lu\n",
59		swap_cache_info.add_total, swap_cache_info.del_total,
60		swap_cache_info.find_success, swap_cache_info.find_total,
61		swap_cache_info.noent_race, swap_cache_info.exist_race);
62	printk("Free swap  = %lukB\n", nr_swap_pages << (PAGE_SHIFT - 10));
63	printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10));
64}
65
66/*
67 * __add_to_swap_cache resembles add_to_page_cache on swapper_space,
68 * but sets SwapCache flag and private instead of mapping and index.
69 */
70static int __add_to_swap_cache(struct page *page,
71		swp_entry_t entry, int gfp_mask)
72{
73	int error;
74
75	BUG_ON(PageSwapCache(page));
76	BUG_ON(PagePrivate(page));
77	error = radix_tree_preload(gfp_mask);
78	if (!error) {
79		write_lock_irq(&swapper_space.tree_lock);
80		error = radix_tree_insert(&swapper_space.page_tree,
81						entry.val, page);
82		if (!error) {
83			page_cache_get(page);
84			SetPageLocked(page);
85			SetPageSwapCache(page);
86			page->private = entry.val;
87			total_swapcache_pages++;
88			pagecache_acct(1);
89		}
90		write_unlock_irq(&swapper_space.tree_lock);
91		radix_tree_preload_end();
92	}
93	return error;
94}
95
96static int add_to_swap_cache(struct page *page, swp_entry_t entry)
97{
98	int error;
99
100	if (!swap_duplicate(entry)) {
101		INC_CACHE_INFO(noent_race);
102		return -ENOENT;
103	}
104	error = __add_to_swap_cache(page, entry, GFP_KERNEL);
105	/*
106	 * Anon pages are already on the LRU, we don't run lru_cache_add here.
107	 */
108	if (error) {
109		swap_free(entry);
110		if (error == -EEXIST)
111			INC_CACHE_INFO(exist_race);
112		return error;
113	}
114	INC_CACHE_INFO(add_total);
115	return 0;
116}
117
118/*
119 * This must be called only on pages that have
120 * been verified to be in the swap cache.
121 */
122void __delete_from_swap_cache(struct page *page)
123{
124	BUG_ON(!PageLocked(page));
125	BUG_ON(!PageSwapCache(page));
126	BUG_ON(PageWriteback(page));
127
128	radix_tree_delete(&swapper_space.page_tree, page->private);
129	page->private = 0;
130	ClearPageSwapCache(page);
131	total_swapcache_pages--;
132	pagecache_acct(-1);
133	INC_CACHE_INFO(del_total);
134}
135
136/**
137 * add_to_swap - allocate swap space for a page
138 * @page: page we want to move to swap
139 *
140 * Allocate swap space for the page and add the page to the
141 * swap cache.  Caller needs to hold the page lock.
142 */
143int add_to_swap(struct page * page)
144{
145	swp_entry_t entry;
146	int err;
147
148	if (!PageLocked(page))
149		BUG();
150
151	for (;;) {
152		entry = get_swap_page();
153		if (!entry.val)
154			return 0;
155
156		/*
157		 * Radix-tree node allocations from PF_MEMALLOC contexts could
158		 * completely exhaust the page allocator. __GFP_NOMEMALLOC
159		 * stops emergency reserves from being allocated.
160		 *
161		 * TODO: this could cause a theoretical memory reclaim
162		 * deadlock in the swap out path.
163		 */
164		/*
165		 * Add it to the swap cache and mark it dirty
166		 */
167		err = __add_to_swap_cache(page, entry,
168				GFP_ATOMIC|__GFP_NOMEMALLOC|__GFP_NOWARN);
169
170		switch (err) {
171		case 0:				/* Success */
172			SetPageUptodate(page);
173			SetPageDirty(page);
174			INC_CACHE_INFO(add_total);
175			return 1;
176		case -EEXIST:
177			/* Raced with "speculative" read_swap_cache_async */
178			INC_CACHE_INFO(exist_race);
179			swap_free(entry);
180			continue;
181		default:
182			/* -ENOMEM radix-tree allocation failure */
183			swap_free(entry);
184			return 0;
185		}
186	}
187}
188
189/*
190 * This must be called only on pages that have
191 * been verified to be in the swap cache and locked.
192 * It will never put the page into the free list,
193 * the caller has a reference on the page.
194 */
195void delete_from_swap_cache(struct page *page)
196{
197	swp_entry_t entry;
198
199	BUG_ON(!PageSwapCache(page));
200	BUG_ON(!PageLocked(page));
201	BUG_ON(PageWriteback(page));
202	BUG_ON(PagePrivate(page));
203
204	entry.val = page->private;
205
206	write_lock_irq(&swapper_space.tree_lock);
207	__delete_from_swap_cache(page);
208	write_unlock_irq(&swapper_space.tree_lock);
209
210	swap_free(entry);
211	page_cache_release(page);
212}
213
214/*
215 * Strange swizzling function only for use by shmem_writepage
216 */
217int move_to_swap_cache(struct page *page, swp_entry_t entry)
218{
219	int err = __add_to_swap_cache(page, entry, GFP_ATOMIC);
220	if (!err) {
221		remove_from_page_cache(page);
222		page_cache_release(page);	/* pagecache ref */
223		if (!swap_duplicate(entry))
224			BUG();
225		SetPageDirty(page);
226		INC_CACHE_INFO(add_total);
227	} else if (err == -EEXIST)
228		INC_CACHE_INFO(exist_race);
229	return err;
230}
231
232/*
233 * Strange swizzling function for shmem_getpage (and shmem_unuse)
234 */
235int move_from_swap_cache(struct page *page, unsigned long index,
236		struct address_space *mapping)
237{
238	int err = add_to_page_cache(page, mapping, index, GFP_ATOMIC);
239	if (!err) {
240		delete_from_swap_cache(page);
241		/* shift page from clean_pages to dirty_pages list */
242		ClearPageDirty(page);
243		set_page_dirty(page);
244	}
245	return err;
246}
247
/*
 * If we are the only user, then try to free up the swap cache.
 *
 * Its ok to check for PageSwapCache without the page lock
 * here because we are going to recheck again inside
 * exclusive_swap_page() _with_ the lock.
 * 					- Marcelo
 *
 * The page lock is only tried, never waited for: if it is already
 * held, nothing is done.
 */
static inline void free_swap_cache(struct page *page)
{
	if (!PageSwapCache(page))
		return;

	/* Non-zero means the lock bit was already set: skip the page. */
	if (TestSetPageLocked(page))
		return;

	remove_exclusive_swap_page(page);
	unlock_page(page);
}
263
/*
 * Perform a free_page(), also freeing any swap cache associated with
 * this page if it is the last user of the page. Can not do a lock_page,
 * as we are holding the page_table_lock spinlock.
 *
 * The swap cache part is best effort (see free_swap_cache()); the
 * caller's page reference is dropped unconditionally.
 */
void free_page_and_swap_cache(struct page *page)
{
	/* Try to shed the swap cache entry first ... */
	free_swap_cache(page);

	/* ... then drop the reference the caller passed in. */
	page_cache_release(page);
}
274
/*
 * Passed an array of pages, drop them all from swapcache and then release
 * them.  They are removed from the LRU and freed if this is their last use.
 *
 * The array is processed in batches of at most 16 pages: each batch
 * goes through free_swap_cache() page by page and is then handed to
 * release_pages() in one call.
 */
void free_pages_and_swap_cache(struct page **pages, int nr)
{
	int batch_max = 16;
	struct page **cursor = pages;
	int remaining = nr;

	lru_add_drain();

	while (remaining) {
		int batch = min(batch_max, remaining);
		int idx;

		for (idx = 0; idx < batch; idx++) {
			free_swap_cache(cursor[idx]);
		}
		release_pages(cursor, batch, 0);

		cursor += batch;
		remaining -= batch;
	}
}
296
297/*
298 * Lookup a swap entry in the swap cache. A found page will be returned
299 * unlocked and with its refcount incremented - we rely on the kernel
300 * lock getting page table operations atomic even if we drop the page
301 * lock before returning.
302 */
303struct page * lookup_swap_cache(swp_entry_t entry)
304{
305	struct page *page;
306
307	page = find_get_page(&swapper_space, entry.val);
308
309	if (page)
310		INC_CACHE_INFO(find_success);
311
312	INC_CACHE_INFO(find_total);
313	return page;
314}
315
316/*
317 * Locate a page of swap in physical memory, reserving swap cache space
318 * and reading the disk if it is not already cached.
319 * A failure return means that either the page allocation failed or that
320 * the swap entry is no longer in use.
321 */
322struct page *read_swap_cache_async(swp_entry_t entry,
323			struct vm_area_struct *vma, unsigned long addr)
324{
325	struct page *found_page, *new_page = NULL;
326	int err;
327
328	do {
329		/*
330		 * First check the swap cache.  Since this is normally
331		 * called after lookup_swap_cache() failed, re-calling
332		 * that would confuse statistics.
333		 */
334		found_page = find_get_page(&swapper_space, entry.val);
335		if (found_page)
336			break;
337
338		/*
339		 * Get a new page to read into from swap.
340		 */
341		if (!new_page) {
342			new_page = alloc_page_vma(GFP_HIGHUSER, vma, addr);
343			if (!new_page)
344				break;		/* Out of memory */
345		}
346
347		/*
348		 * Associate the page with swap entry in the swap cache.
349		 * May fail (-ENOENT) if swap entry has been freed since
350		 * our caller observed it.  May fail (-EEXIST) if there
351		 * is already a page associated with this entry in the
352		 * swap cache: added by a racing read_swap_cache_async,
353		 * or by try_to_swap_out (or shmem_writepage) re-using
354		 * the just freed swap entry for an existing page.
355		 * May fail (-ENOMEM) if radix-tree node allocation failed.
356		 */
357		err = add_to_swap_cache(new_page, entry);
358		if (!err) {
359			/*
360			 * Initiate read into locked page and return.
361			 */
362			lru_cache_add_active(new_page);
363			swap_readpage(NULL, new_page);
364			return new_page;
365		}
366	} while (err != -ENOENT && err != -ENOMEM);
367
368	if (new_page)
369		page_cache_release(new_page);
370	return found_page;
371}
372