swapfile.c revision e525fd89d380c4a94c0d63913a1dd1a593ed25e7
1/*
2 *  linux/mm/swapfile.c
3 *
4 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
5 *  Swap reorganised 29.12.95, Stephen Tweedie
6 */
7
8#include <linux/mm.h>
9#include <linux/hugetlb.h>
10#include <linux/mman.h>
11#include <linux/slab.h>
12#include <linux/kernel_stat.h>
13#include <linux/swap.h>
14#include <linux/vmalloc.h>
15#include <linux/pagemap.h>
16#include <linux/namei.h>
17#include <linux/shm.h>
18#include <linux/blkdev.h>
19#include <linux/random.h>
20#include <linux/writeback.h>
21#include <linux/proc_fs.h>
22#include <linux/seq_file.h>
23#include <linux/init.h>
24#include <linux/module.h>
25#include <linux/ksm.h>
26#include <linux/rmap.h>
27#include <linux/security.h>
28#include <linux/backing-dev.h>
29#include <linux/mutex.h>
30#include <linux/capability.h>
31#include <linux/syscalls.h>
32#include <linux/memcontrol.h>
33#include <linux/poll.h>
34
35#include <asm/pgtable.h>
36#include <asm/tlbflush.h>
37#include <linux/swapops.h>
38#include <linux/page_cgroup.h>
39
/*
 * Prototypes for static helpers that are called before their definitions.
 * NOTE(review): the definitions are not in this part of the file --
 * presumably they appear further down.
 */
40static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
41				 unsigned char);
42static void free_swap_count_continuations(struct swap_info_struct *);
43static sector_t map_swap_entry(swp_entry_t, struct block_device**);
44
/*
 * swap_lock is taken by the allocation and freeing paths below around
 * their updates to swap_info[], swap_list and the page counters.
 */
45static DEFINE_SPINLOCK(swap_lock);
/* Upper bound used when range-checking a swap type before indexing swap_info[] */
46static unsigned int nr_swapfiles;
/* Decremented when a slot is handed out, incremented when a slot is freed */
47long nr_swap_pages;
/* NOTE(review): not touched in this part of the file -- presumably the total size of all active swap */
48long total_swap_pages;
/* NOTE(review): not touched in this part of the file -- presumably used when assigning default priorities */
49static int least_priority;
50
/* Message prefixes printed by swap_info_get() when it rejects an entry */
51static const char Bad_file[] = "Bad swap file entry ";
52static const char Unused_file[] = "Unused swap file entry ";
53static const char Bad_offset[] = "Bad swap offset entry ";
54static const char Unused_offset[] = "Unused swap offset entry ";
55
/*
 * Both fields start at -1; get_swap_page() walks swap types from
 * swap_list.next and stops as soon as the type index is negative.
 */
56static struct swap_list_t swap_list = {-1, -1};
57
/* One pointer per swap type, indexed by swp_type() of an entry */
58static struct swap_info_struct *swap_info[MAX_SWAPFILES];
59
/* NOTE(review): not used in this part of the file -- presumably serialises swapon */
60static DEFINE_MUTEX(swapon_mutex);
61
/* NOTE(review): not used in this part of the file -- presumably for poll() on a proc file */
62static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait);
63/* Activity counter to indicate that a swapon or swapoff has occurred */
64static atomic_t proc_poll_event = ATOMIC_INIT(0);
65
66static inline unsigned char swap_count(unsigned char ent)
67{
68	return ent & ~SWAP_HAS_CACHE;	/* may include SWAP_HAS_CONT flag */
69}
70
71/* returns 1 if swap entry is freed */
72static int
73__try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
74{
75	swp_entry_t entry = swp_entry(si->type, offset);
76	struct page *page;
77	int ret = 0;
78
79	page = find_get_page(&swapper_space, entry.val);
80	if (!page)
81		return 0;
82	/*
83	 * This function is called from scan_swap_map() and it's called
84	 * by vmscan.c at reclaiming pages. So, we hold a lock on a page, here.
85	 * We have to use trylock for avoiding deadlock. This is a special
86	 * case and you should use try_to_free_swap() with explicit lock_page()
87	 * in usual operations.
88	 */
89	if (trylock_page(page)) {
90		ret = try_to_free_swap(page);
91		unlock_page(page);
92	}
93	page_cache_release(page);
94	return ret;
95}
96
97/*
98 * We need this because the bdev->unplug_fn can sleep and we cannot
99 * hold swap_lock while calling the unplug_fn. And swap_lock
100 * cannot be turned into a mutex.
101 */
102static DECLARE_RWSEM(swap_unplug_sem);
103
104void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page)
105{
106	swp_entry_t entry;
107
108	down_read(&swap_unplug_sem);
109	entry.val = page_private(page);
110	if (PageSwapCache(page)) {
111		struct block_device *bdev = swap_info[swp_type(entry)]->bdev;
112		struct backing_dev_info *bdi;
113
114		/*
115		 * If the page is removed from swapcache from under us (with a
116		 * racy try_to_unuse/swapoff) we need an additional reference
117		 * count to avoid reading garbage from page_private(page) above.
118		 * If the WARN_ON triggers during a swapoff it maybe the race
119		 * condition and it's harmless. However if it triggers without
120		 * swapoff it signals a problem.
121		 */
122		WARN_ON(page_count(page) <= 1);
123
124		bdi = bdev->bd_inode->i_mapping->backing_dev_info;
125		blk_run_backing_dev(bdi, page);
126	}
127	up_read(&swap_unplug_sem);
128}
129
130/*
131 * swapon tell device that all the old swap contents can be discarded,
132 * to allow the swap device to optimize its wear-levelling.
133 */
134static int discard_swap(struct swap_info_struct *si)
135{
136	struct swap_extent *se;
137	sector_t start_block;
138	sector_t nr_blocks;
139	int err = 0;
140
141	/* Do not discard the swap header page! */
142	se = &si->first_swap_extent;
143	start_block = (se->start_block + 1) << (PAGE_SHIFT - 9);
144	nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
145	if (nr_blocks) {
146		err = blkdev_issue_discard(si->bdev, start_block,
147				nr_blocks, GFP_KERNEL, 0);
148		if (err)
149			return err;
150		cond_resched();
151	}
152
153	list_for_each_entry(se, &si->first_swap_extent.list, list) {
154		start_block = se->start_block << (PAGE_SHIFT - 9);
155		nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
156
157		err = blkdev_issue_discard(si->bdev, start_block,
158				nr_blocks, GFP_KERNEL, 0);
159		if (err)
160			break;
161
162		cond_resched();
163	}
164	return err;		/* That will often be -EOPNOTSUPP */
165}
166
167/*
168 * swap allocation tell device that a cluster of swap can now be discarded,
169 * to allow the swap device to optimize its wear-levelling.
170 */
171static void discard_swap_cluster(struct swap_info_struct *si,
172				 pgoff_t start_page, pgoff_t nr_pages)
173{
174	struct swap_extent *se = si->curr_swap_extent;
175	int found_extent = 0;
176
177	while (nr_pages) {
178		struct list_head *lh;
179
180		if (se->start_page <= start_page &&
181		    start_page < se->start_page + se->nr_pages) {
182			pgoff_t offset = start_page - se->start_page;
183			sector_t start_block = se->start_block + offset;
184			sector_t nr_blocks = se->nr_pages - offset;
185
186			if (nr_blocks > nr_pages)
187				nr_blocks = nr_pages;
188			start_page += nr_blocks;
189			nr_pages -= nr_blocks;
190
191			if (!found_extent++)
192				si->curr_swap_extent = se;
193
194			start_block <<= PAGE_SHIFT - 9;
195			nr_blocks <<= PAGE_SHIFT - 9;
196			if (blkdev_issue_discard(si->bdev, start_block,
197				    nr_blocks, GFP_NOIO, 0))
198				break;
199		}
200
201		lh = se->list.next;
202		se = list_entry(lh, struct swap_extent, list);
203	}
204}
205
/*
 * Action routine handed to wait_on_bit() by scan_swap_map() while waiting
 * for SWP_DISCARDING to clear: just give up the CPU.
 * @word is unused.  Always returns 0.
 */
static int wait_for_discard(void *word)
{
	schedule();

	return 0;
}
211
212#define SWAPFILE_CLUSTER	256
213#define LATENCY_LIMIT		256
214
215static inline unsigned long scan_swap_map(struct swap_info_struct *si,
216					  unsigned char usage)
217{
218	unsigned long offset;
219	unsigned long scan_base;
220	unsigned long last_in_cluster = 0;
221	int latency_ration = LATENCY_LIMIT;
222	int found_free_cluster = 0;
223
224	/*
225	 * We try to cluster swap pages by allocating them sequentially
226	 * in swap.  Once we've allocated SWAPFILE_CLUSTER pages this
227	 * way, however, we resort to first-free allocation, starting
228	 * a new cluster.  This prevents us from scattering swap pages
229	 * all over the entire swap partition, so that we reduce
230	 * overall disk seek times between swap pages.  -- sct
231	 * But we do now try to find an empty cluster.  -Andrea
232	 * And we let swap pages go all over an SSD partition.  Hugh
233	 */
234
235	si->flags += SWP_SCANNING;
236	scan_base = offset = si->cluster_next;
237
238	if (unlikely(!si->cluster_nr--)) {
239		if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
240			si->cluster_nr = SWAPFILE_CLUSTER - 1;
241			goto checks;
242		}
243		if (si->flags & SWP_DISCARDABLE) {
244			/*
245			 * Start range check on racing allocations, in case
246			 * they overlap the cluster we eventually decide on
247			 * (we scan without swap_lock to allow preemption).
248			 * It's hardly conceivable that cluster_nr could be
249			 * wrapped during our scan, but don't depend on it.
250			 */
251			if (si->lowest_alloc)
252				goto checks;
253			si->lowest_alloc = si->max;
254			si->highest_alloc = 0;
255		}
256		spin_unlock(&swap_lock);
257
258		/*
259		 * If seek is expensive, start searching for new cluster from
260		 * start of partition, to minimize the span of allocated swap.
261		 * But if seek is cheap, search from our current position, so
262		 * that swap is allocated from all over the partition: if the
263		 * Flash Translation Layer only remaps within limited zones,
264		 * we don't want to wear out the first zone too quickly.
265		 */
266		if (!(si->flags & SWP_SOLIDSTATE))
267			scan_base = offset = si->lowest_bit;
268		last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
269
270		/* Locate the first empty (unaligned) cluster */
271		for (; last_in_cluster <= si->highest_bit; offset++) {
272			if (si->swap_map[offset])
273				last_in_cluster = offset + SWAPFILE_CLUSTER;
274			else if (offset == last_in_cluster) {
275				spin_lock(&swap_lock);
276				offset -= SWAPFILE_CLUSTER - 1;
277				si->cluster_next = offset;
278				si->cluster_nr = SWAPFILE_CLUSTER - 1;
279				found_free_cluster = 1;
280				goto checks;
281			}
282			if (unlikely(--latency_ration < 0)) {
283				cond_resched();
284				latency_ration = LATENCY_LIMIT;
285			}
286		}
287
288		offset = si->lowest_bit;
289		last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
290
291		/* Locate the first empty (unaligned) cluster */
292		for (; last_in_cluster < scan_base; offset++) {
293			if (si->swap_map[offset])
294				last_in_cluster = offset + SWAPFILE_CLUSTER;
295			else if (offset == last_in_cluster) {
296				spin_lock(&swap_lock);
297				offset -= SWAPFILE_CLUSTER - 1;
298				si->cluster_next = offset;
299				si->cluster_nr = SWAPFILE_CLUSTER - 1;
300				found_free_cluster = 1;
301				goto checks;
302			}
303			if (unlikely(--latency_ration < 0)) {
304				cond_resched();
305				latency_ration = LATENCY_LIMIT;
306			}
307		}
308
309		offset = scan_base;
310		spin_lock(&swap_lock);
311		si->cluster_nr = SWAPFILE_CLUSTER - 1;
312		si->lowest_alloc = 0;
313	}
314
315checks:
316	if (!(si->flags & SWP_WRITEOK))
317		goto no_page;
318	if (!si->highest_bit)
319		goto no_page;
320	if (offset > si->highest_bit)
321		scan_base = offset = si->lowest_bit;
322
323	/* reuse swap entry of cache-only swap if not busy. */
324	if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
325		int swap_was_freed;
326		spin_unlock(&swap_lock);
327		swap_was_freed = __try_to_reclaim_swap(si, offset);
328		spin_lock(&swap_lock);
329		/* entry was freed successfully, try to use this again */
330		if (swap_was_freed)
331			goto checks;
332		goto scan; /* check next one */
333	}
334
335	if (si->swap_map[offset])
336		goto scan;
337
338	if (offset == si->lowest_bit)
339		si->lowest_bit++;
340	if (offset == si->highest_bit)
341		si->highest_bit--;
342	si->inuse_pages++;
343	if (si->inuse_pages == si->pages) {
344		si->lowest_bit = si->max;
345		si->highest_bit = 0;
346	}
347	si->swap_map[offset] = usage;
348	si->cluster_next = offset + 1;
349	si->flags -= SWP_SCANNING;
350
351	if (si->lowest_alloc) {
352		/*
353		 * Only set when SWP_DISCARDABLE, and there's a scan
354		 * for a free cluster in progress or just completed.
355		 */
356		if (found_free_cluster) {
357			/*
358			 * To optimize wear-levelling, discard the
359			 * old data of the cluster, taking care not to
360			 * discard any of its pages that have already
361			 * been allocated by racing tasks (offset has
362			 * already stepped over any at the beginning).
363			 */
364			if (offset < si->highest_alloc &&
365			    si->lowest_alloc <= last_in_cluster)
366				last_in_cluster = si->lowest_alloc - 1;
367			si->flags |= SWP_DISCARDING;
368			spin_unlock(&swap_lock);
369
370			if (offset < last_in_cluster)
371				discard_swap_cluster(si, offset,
372					last_in_cluster - offset + 1);
373
374			spin_lock(&swap_lock);
375			si->lowest_alloc = 0;
376			si->flags &= ~SWP_DISCARDING;
377
378			smp_mb();	/* wake_up_bit advises this */
379			wake_up_bit(&si->flags, ilog2(SWP_DISCARDING));
380
381		} else if (si->flags & SWP_DISCARDING) {
382			/*
383			 * Delay using pages allocated by racing tasks
384			 * until the whole discard has been issued. We
385			 * could defer that delay until swap_writepage,
386			 * but it's easier to keep this self-contained.
387			 */
388			spin_unlock(&swap_lock);
389			wait_on_bit(&si->flags, ilog2(SWP_DISCARDING),
390				wait_for_discard, TASK_UNINTERRUPTIBLE);
391			spin_lock(&swap_lock);
392		} else {
393			/*
394			 * Note pages allocated by racing tasks while
395			 * scan for a free cluster is in progress, so
396			 * that its final discard can exclude them.
397			 */
398			if (offset < si->lowest_alloc)
399				si->lowest_alloc = offset;
400			if (offset > si->highest_alloc)
401				si->highest_alloc = offset;
402		}
403	}
404	return offset;
405
406scan:
407	spin_unlock(&swap_lock);
408	while (++offset <= si->highest_bit) {
409		if (!si->swap_map[offset]) {
410			spin_lock(&swap_lock);
411			goto checks;
412		}
413		if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
414			spin_lock(&swap_lock);
415			goto checks;
416		}
417		if (unlikely(--latency_ration < 0)) {
418			cond_resched();
419			latency_ration = LATENCY_LIMIT;
420		}
421	}
422	offset = si->lowest_bit;
423	while (++offset < scan_base) {
424		if (!si->swap_map[offset]) {
425			spin_lock(&swap_lock);
426			goto checks;
427		}
428		if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
429			spin_lock(&swap_lock);
430			goto checks;
431		}
432		if (unlikely(--latency_ration < 0)) {
433			cond_resched();
434			latency_ration = LATENCY_LIMIT;
435		}
436	}
437	spin_lock(&swap_lock);
438
439no_page:
440	si->flags -= SWP_SCANNING;
441	return 0;
442}
443
444swp_entry_t get_swap_page(void)
445{
446	struct swap_info_struct *si;
447	pgoff_t offset;
448	int type, next;
449	int wrapped = 0;
450
451	spin_lock(&swap_lock);
452	if (nr_swap_pages <= 0)
453		goto noswap;
454	nr_swap_pages--;
455
456	for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) {
457		si = swap_info[type];
458		next = si->next;
459		if (next < 0 ||
460		    (!wrapped && si->prio != swap_info[next]->prio)) {
461			next = swap_list.head;
462			wrapped++;
463		}
464
465		if (!si->highest_bit)
466			continue;
467		if (!(si->flags & SWP_WRITEOK))
468			continue;
469
470		swap_list.next = next;
471		/* This is called for allocating swap entry for cache */
472		offset = scan_swap_map(si, SWAP_HAS_CACHE);
473		if (offset) {
474			spin_unlock(&swap_lock);
475			return swp_entry(type, offset);
476		}
477		next = swap_list.next;
478	}
479
480	nr_swap_pages++;
481noswap:
482	spin_unlock(&swap_lock);
483	return (swp_entry_t) {0};
484}
485
486/* The only caller of this function is now susupend routine */
487swp_entry_t get_swap_page_of_type(int type)
488{
489	struct swap_info_struct *si;
490	pgoff_t offset;
491
492	spin_lock(&swap_lock);
493	si = swap_info[type];
494	if (si && (si->flags & SWP_WRITEOK)) {
495		nr_swap_pages--;
496		/* This is called for allocating swap entry, not cache */
497		offset = scan_swap_map(si, 1);
498		if (offset) {
499			spin_unlock(&swap_lock);
500			return swp_entry(type, offset);
501		}
502		nr_swap_pages++;
503	}
504	spin_unlock(&swap_lock);
505	return (swp_entry_t) {0};
506}
507
508static struct swap_info_struct *swap_info_get(swp_entry_t entry)
509{
510	struct swap_info_struct *p;
511	unsigned long offset, type;
512
513	if (!entry.val)
514		goto out;
515	type = swp_type(entry);
516	if (type >= nr_swapfiles)
517		goto bad_nofile;
518	p = swap_info[type];
519	if (!(p->flags & SWP_USED))
520		goto bad_device;
521	offset = swp_offset(entry);
522	if (offset >= p->max)
523		goto bad_offset;
524	if (!p->swap_map[offset])
525		goto bad_free;
526	spin_lock(&swap_lock);
527	return p;
528
529bad_free:
530	printk(KERN_ERR "swap_free: %s%08lx\n", Unused_offset, entry.val);
531	goto out;
532bad_offset:
533	printk(KERN_ERR "swap_free: %s%08lx\n", Bad_offset, entry.val);
534	goto out;
535bad_device:
536	printk(KERN_ERR "swap_free: %s%08lx\n", Unused_file, entry.val);
537	goto out;
538bad_nofile:
539	printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val);
540out:
541	return NULL;
542}
543
544static unsigned char swap_entry_free(struct swap_info_struct *p,
545				     swp_entry_t entry, unsigned char usage)
546{
547	unsigned long offset = swp_offset(entry);
548	unsigned char count;
549	unsigned char has_cache;
550
551	count = p->swap_map[offset];
552	has_cache = count & SWAP_HAS_CACHE;
553	count &= ~SWAP_HAS_CACHE;
554
555	if (usage == SWAP_HAS_CACHE) {
556		VM_BUG_ON(!has_cache);
557		has_cache = 0;
558	} else if (count == SWAP_MAP_SHMEM) {
559		/*
560		 * Or we could insist on shmem.c using a special
561		 * swap_shmem_free() and free_shmem_swap_and_cache()...
562		 */
563		count = 0;
564	} else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
565		if (count == COUNT_CONTINUED) {
566			if (swap_count_continued(p, offset, count))
567				count = SWAP_MAP_MAX | COUNT_CONTINUED;
568			else
569				count = SWAP_MAP_MAX;
570		} else
571			count--;
572	}
573
574	if (!count)
575		mem_cgroup_uncharge_swap(entry);
576
577	usage = count | has_cache;
578	p->swap_map[offset] = usage;
579
580	/* free if no reference */
581	if (!usage) {
582		struct gendisk *disk = p->bdev->bd_disk;
583		if (offset < p->lowest_bit)
584			p->lowest_bit = offset;
585		if (offset > p->highest_bit)
586			p->highest_bit = offset;
587		if (swap_list.next >= 0 &&
588		    p->prio > swap_info[swap_list.next]->prio)
589			swap_list.next = p->type;
590		nr_swap_pages++;
591		p->inuse_pages--;
592		if ((p->flags & SWP_BLKDEV) &&
593				disk->fops->swap_slot_free_notify)
594			disk->fops->swap_slot_free_notify(p->bdev, offset);
595	}
596
597	return usage;
598}
599
600/*
601 * Caller has made sure that the swapdevice corresponding to entry
602 * is still around or has not been recycled.
603 */
604void swap_free(swp_entry_t entry)
605{
606	struct swap_info_struct *p;
607
608	p = swap_info_get(entry);
609	if (p) {
610		swap_entry_free(p, entry, 1);
611		spin_unlock(&swap_lock);
612	}
613}
614
615/*
616 * Called after dropping swapcache to decrease refcnt to swap entries.
617 */
618void swapcache_free(swp_entry_t entry, struct page *page)
619{
620	struct swap_info_struct *p;
621	unsigned char count;
622
623	p = swap_info_get(entry);
624	if (p) {
625		count = swap_entry_free(p, entry, SWAP_HAS_CACHE);
626		if (page)
627			mem_cgroup_uncharge_swapcache(page, entry, count != 0);
628		spin_unlock(&swap_lock);
629	}
630}
631
632/*
633 * How many references to page are currently swapped out?
634 * This does not give an exact answer when swap count is continued,
635 * but does include the high COUNT_CONTINUED flag to allow for that.
636 */
637static inline int page_swapcount(struct page *page)
638{
639	int count = 0;
640	struct swap_info_struct *p;
641	swp_entry_t entry;
642
643	entry.val = page_private(page);
644	p = swap_info_get(entry);
645	if (p) {
646		count = swap_count(p->swap_map[swp_offset(entry)]);
647		spin_unlock(&swap_lock);
648	}
649	return count;
650}
651
/*
 * We can write to an anon page without COW if there are no other references
 * to it.  And as a side-effect, free up its swap: because the old content
 * on disk will never be read, and seeking back there to write new content
 * later would only waste time away from clustering.
 *
 * The page must be locked.  Returns nonzero if the page may be reused.
 */
int reuse_swap_page(struct page *page)
{
	int refs;

	VM_BUG_ON(!PageLocked(page));
	if (unlikely(PageKsm(page)))
		return 0;

	refs = page_mapcount(page);
	if (refs > 1 || !PageSwapCache(page))
		return refs <= 1;

	/* Add the references that are still swapped out */
	refs += page_swapcount(page);
	if (refs == 1 && !PageWriteback(page)) {
		delete_from_swap_cache(page);
		SetPageDirty(page);
	}
	return refs <= 1;
}
675
676/*
677 * If swap is getting full, or if there are no more mappings of this page,
678 * then try_to_free_swap is called to free its swap space.
679 */
680int try_to_free_swap(struct page *page)
681{
682	VM_BUG_ON(!PageLocked(page));
683
684	if (!PageSwapCache(page))
685		return 0;
686	if (PageWriteback(page))
687		return 0;
688	if (page_swapcount(page))
689		return 0;
690
691	/*
692	 * Once hibernation has begun to create its image of memory,
693	 * there's a danger that one of the calls to try_to_free_swap()
694	 * - most probably a call from __try_to_reclaim_swap() while
695	 * hibernation is allocating its own swap pages for the image,
696	 * but conceivably even a call from memory reclaim - will free
697	 * the swap from a page which has already been recorded in the
698	 * image as a clean swapcache page, and then reuse its swap for
699	 * another page of the image.  On waking from hibernation, the
700	 * original page might be freed under memory pressure, then
701	 * later read back in from swap, now with the wrong data.
702	 *
703	 * Hibernation clears bits from gfp_allowed_mask to prevent
704	 * memory reclaim from writing to disk, so check that here.
705	 */
706	if (!(gfp_allowed_mask & __GFP_IO))
707		return 0;
708
709	delete_from_swap_cache(page);
710	SetPageDirty(page);
711	return 1;
712}
713
714/*
715 * Free the swap entry like above, but also try to
716 * free the page cache entry if it is the last user.
717 */
718int free_swap_and_cache(swp_entry_t entry)
719{
720	struct swap_info_struct *p;
721	struct page *page = NULL;
722
723	if (non_swap_entry(entry))
724		return 1;
725
726	p = swap_info_get(entry);
727	if (p) {
728		if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) {
729			page = find_get_page(&swapper_space, entry.val);
730			if (page && !trylock_page(page)) {
731				page_cache_release(page);
732				page = NULL;
733			}
734		}
735		spin_unlock(&swap_lock);
736	}
737	if (page) {
738		/*
739		 * Not mapped elsewhere, or swap space full? Free it!
740		 * Also recheck PageSwapCache now page is locked (above).
741		 */
742		if (PageSwapCache(page) && !PageWriteback(page) &&
743				(!page_mapped(page) || vm_swap_full())) {
744			delete_from_swap_cache(page);
745			SetPageDirty(page);
746		}
747		unlock_page(page);
748		page_cache_release(page);
749	}
750	return p != NULL;
751}
752
753#ifdef CONFIG_CGROUP_MEM_RES_CTLR
754/**
755 * mem_cgroup_count_swap_user - count the user of a swap entry
756 * @ent: the swap entry to be checked
757 * @pagep: the pointer for the swap cache page of the entry to be stored
758 *
759 * Returns the number of the user of the swap entry. The number is valid only
760 * for swaps of anonymous pages.
761 * If the entry is found on swap cache, the page is stored to pagep with
762 * refcount of it being incremented.
763 */
764int mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep)
765{
766	struct page *page;
767	struct swap_info_struct *p;
768	int count = 0;
769
770	page = find_get_page(&swapper_space, ent.val);
771	if (page)
772		count += page_mapcount(page);
773	p = swap_info_get(ent);
774	if (p) {
775		count += swap_count(p->swap_map[swp_offset(ent)]);
776		spin_unlock(&swap_lock);
777	}
778
779	*pagep = page;
780	return count;
781}
782#endif
783
784#ifdef CONFIG_HIBERNATION
785/*
786 * Find the swap type that corresponds to given device (if any).
787 *
788 * @offset - number of the PAGE_SIZE-sized block of the device, starting
789 * from 0, in which the swap header is expected to be located.
790 *
791 * This is needed for the suspend to disk (aka swsusp).
792 */
793int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
794{
795	struct block_device *bdev = NULL;
796	int type;
797
798	if (device)
799		bdev = bdget(device);
800
801	spin_lock(&swap_lock);
802	for (type = 0; type < nr_swapfiles; type++) {
803		struct swap_info_struct *sis = swap_info[type];
804
805		if (!(sis->flags & SWP_WRITEOK))
806			continue;
807
808		if (!bdev) {
809			if (bdev_p)
810				*bdev_p = bdgrab(sis->bdev);
811
812			spin_unlock(&swap_lock);
813			return type;
814		}
815		if (bdev == sis->bdev) {
816			struct swap_extent *se = &sis->first_swap_extent;
817
818			if (se->start_block == offset) {
819				if (bdev_p)
820					*bdev_p = bdgrab(sis->bdev);
821
822				spin_unlock(&swap_lock);
823				bdput(bdev);
824				return type;
825			}
826		}
827	}
828	spin_unlock(&swap_lock);
829	if (bdev)
830		bdput(bdev);
831
832	return -ENODEV;
833}
834
835/*
836 * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev
837 * corresponding to given index in swap_info (swap type).
838 */
839sector_t swapdev_block(int type, pgoff_t offset)
840{
841	struct block_device *bdev;
842
843	if ((unsigned int)type >= nr_swapfiles)
844		return 0;
845	if (!(swap_info[type]->flags & SWP_WRITEOK))
846		return 0;
847	return map_swap_entry(swp_entry(type, offset), &bdev);
848}
849
850/*
851 * Return either the total number of swap pages of given type, or the number
852 * of free pages of that type (depending on @free)
853 *
854 * This is needed for software suspend
855 */
856unsigned int count_swap_pages(int type, int free)
857{
858	unsigned int n = 0;
859
860	spin_lock(&swap_lock);
861	if ((unsigned int)type < nr_swapfiles) {
862		struct swap_info_struct *sis = swap_info[type];
863
864		if (sis->flags & SWP_WRITEOK) {
865			n = sis->pages;
866			if (free)
867				n -= sis->inuse_pages;
868		}
869	}
870	spin_unlock(&swap_lock);
871	return n;
872}
873#endif /* CONFIG_HIBERNATION */
874
875/*
876 * No need to decide whether this PTE shares the swap entry with others,
877 * just let do_wp_page work it out if a write is requested later - to
878 * force COW, vm_page_prot omits write permission from any private vma.
879 */
880static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
881		unsigned long addr, swp_entry_t entry, struct page *page)
882{
883	struct mem_cgroup *ptr = NULL;
884	spinlock_t *ptl;
885	pte_t *pte;
886	int ret = 1;
887
888	if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, GFP_KERNEL, &ptr)) {
889		ret = -ENOMEM;
890		goto out_nolock;
891	}
892
893	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
894	if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) {
895		if (ret > 0)
896			mem_cgroup_cancel_charge_swapin(ptr);
897		ret = 0;
898		goto out;
899	}
900
901	dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
902	inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
903	get_page(page);
904	set_pte_at(vma->vm_mm, addr, pte,
905		   pte_mkold(mk_pte(page, vma->vm_page_prot)));
906	page_add_anon_rmap(page, vma, addr);
907	mem_cgroup_commit_charge_swapin(page, ptr);
908	swap_free(entry);
909	/*
910	 * Move the page to the active list so it is not
911	 * immediately swapped out again after swapon.
912	 */
913	activate_page(page);
914out:
915	pte_unmap_unlock(pte, ptl);
916out_nolock:
917	return ret;
918}
919
920static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
921				unsigned long addr, unsigned long end,
922				swp_entry_t entry, struct page *page)
923{
924	pte_t swp_pte = swp_entry_to_pte(entry);
925	pte_t *pte;
926	int ret = 0;
927
928	/*
929	 * We don't actually need pte lock while scanning for swp_pte: since
930	 * we hold page lock and mmap_sem, swp_pte cannot be inserted into the
931	 * page table while we're scanning; though it could get zapped, and on
932	 * some architectures (e.g. x86_32 with PAE) we might catch a glimpse
933	 * of unmatched parts which look like swp_pte, so unuse_pte must
934	 * recheck under pte lock.  Scanning without pte lock lets it be
935	 * preemptible whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE.
936	 */
937	pte = pte_offset_map(pmd, addr);
938	do {
939		/*
940		 * swapoff spends a _lot_ of time in this loop!
941		 * Test inline before going to call unuse_pte.
942		 */
943		if (unlikely(pte_same(*pte, swp_pte))) {
944			pte_unmap(pte);
945			ret = unuse_pte(vma, pmd, addr, entry, page);
946			if (ret)
947				goto out;
948			pte = pte_offset_map(pmd, addr);
949		}
950	} while (pte++, addr += PAGE_SIZE, addr != end);
951	pte_unmap(pte - 1);
952out:
953	return ret;
954}
955
956static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
957				unsigned long addr, unsigned long end,
958				swp_entry_t entry, struct page *page)
959{
960	pmd_t *pmd;
961	unsigned long next;
962	int ret;
963
964	pmd = pmd_offset(pud, addr);
965	do {
966		next = pmd_addr_end(addr, end);
967		if (pmd_none_or_clear_bad(pmd))
968			continue;
969		ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
970		if (ret)
971			return ret;
972	} while (pmd++, addr = next, addr != end);
973	return 0;
974}
975
976static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
977				unsigned long addr, unsigned long end,
978				swp_entry_t entry, struct page *page)
979{
980	pud_t *pud;
981	unsigned long next;
982	int ret;
983
984	pud = pud_offset(pgd, addr);
985	do {
986		next = pud_addr_end(addr, end);
987		if (pud_none_or_clear_bad(pud))
988			continue;
989		ret = unuse_pmd_range(vma, pud, addr, next, entry, page);
990		if (ret)
991			return ret;
992	} while (pud++, addr = next, addr != end);
993	return 0;
994}
995
996static int unuse_vma(struct vm_area_struct *vma,
997				swp_entry_t entry, struct page *page)
998{
999	pgd_t *pgd;
1000	unsigned long addr, end, next;
1001	int ret;
1002
1003	if (page_anon_vma(page)) {
1004		addr = page_address_in_vma(page, vma);
1005		if (addr == -EFAULT)
1006			return 0;
1007		else
1008			end = addr + PAGE_SIZE;
1009	} else {
1010		addr = vma->vm_start;
1011		end = vma->vm_end;
1012	}
1013
1014	pgd = pgd_offset(vma->vm_mm, addr);
1015	do {
1016		next = pgd_addr_end(addr, end);
1017		if (pgd_none_or_clear_bad(pgd))
1018			continue;
1019		ret = unuse_pud_range(vma, pgd, addr, next, entry, page);
1020		if (ret)
1021			return ret;
1022	} while (pgd++, addr = next, addr != end);
1023	return 0;
1024}
1025
1026static int unuse_mm(struct mm_struct *mm,
1027				swp_entry_t entry, struct page *page)
1028{
1029	struct vm_area_struct *vma;
1030	int ret = 0;
1031
1032	if (!down_read_trylock(&mm->mmap_sem)) {
1033		/*
1034		 * Activate page so shrink_inactive_list is unlikely to unmap
1035		 * its ptes while lock is dropped, so swapoff can make progress.
1036		 */
1037		activate_page(page);
1038		unlock_page(page);
1039		down_read(&mm->mmap_sem);
1040		lock_page(page);
1041	}
1042	for (vma = mm->mmap; vma; vma = vma->vm_next) {
1043		if (vma->anon_vma && (ret = unuse_vma(vma, entry, page)))
1044			break;
1045	}
1046	up_read(&mm->mmap_sem);
1047	return (ret < 0)? ret: 0;
1048}
1049
1050/*
1051 * Scan swap_map from current position to next entry still in use.
1052 * Recycle to start on reaching the end, returning 0 when empty.
1053 */
1054static unsigned int find_next_to_unuse(struct swap_info_struct *si,
1055					unsigned int prev)
1056{
1057	unsigned int max = si->max;
1058	unsigned int i = prev;
1059	unsigned char count;
1060
1061	/*
1062	 * No need for swap_lock here: we're just looking
1063	 * for whether an entry is in use, not modifying it; false
1064	 * hits are okay, and sys_swapoff() has already prevented new
1065	 * allocations from this area (while holding swap_lock).
1066	 */
1067	for (;;) {
1068		if (++i >= max) {
1069			if (!prev) {
1070				i = 0;
1071				break;
1072			}
1073			/*
1074			 * No entries in use at top of swap_map,
1075			 * loop back to start and recheck there.
1076			 */
1077			max = prev + 1;
1078			prev = 0;
1079			i = 1;
1080		}
1081		count = si->swap_map[i];
1082		if (count && swap_count(count) != SWAP_MAP_BAD)
1083			break;
1084	}
1085	return i;
1086}
1087
/*
 * We completely avoid races by reading each swap page in advance,
 * and then search for the process using it.  All the necessary
 * page table adjustments can then be made atomically.
 *
 * @type: index into swap_info[] of the swap area being emptied.
 *
 * Walks the area's swap_map with find_next_to_unuse() until it reports
 * that no entry is in use.  For each in-use entry the page is brought
 * into the swap cache, references to the entry are removed via
 * shmem_unuse() or unuse_mm(), and the page is then dropped from the
 * swap cache and marked dirty.
 *
 * Returns 0 on completion, -EINTR if a signal is pending for the caller,
 * -ENOMEM if no page could be obtained for an entry that is still in
 * use, or the error handed back by shmem_unuse()/unuse_mm().
 */
static int try_to_unuse(unsigned int type)
{
	struct swap_info_struct *si = swap_info[type];
	struct mm_struct *start_mm;
	unsigned char *swap_map;	/* points at the map byte of entry i */
	unsigned char swcount;		/* snapshot of *swap_map */
	struct page *page;
	swp_entry_t entry;
	unsigned int i = 0;		/* current offset; 0 means "start" */
	int retval = 0;

	/*
	 * When searching mms for an entry, a good strategy is to
	 * start at the first mm we freed the previous entry from
	 * (though actually we don't notice whether we or coincidence
	 * freed the entry).  Initialize this start_mm with a hold.
	 *
	 * A simpler strategy would be to start at the last mm we
	 * freed the previous entry from; but that would take less
	 * advantage of mmlist ordering, which clusters forked mms
	 * together, child after parent.  If we race with dup_mmap(), we
	 * prefer to resolve parent before child, lest we miss entries
	 * duplicated after we scanned child: using last mm would invert
	 * that.
	 */
	start_mm = &init_mm;
	atomic_inc(&init_mm.mm_users);

	/*
	 * Keep on scanning until all entries have gone.  Usually,
	 * one pass through swap_map is enough, but not necessarily:
	 * there are races when an instance of an entry might be missed.
	 */
	while ((i = find_next_to_unuse(si, i)) != 0) {
		if (signal_pending(current)) {
			retval = -EINTR;
			break;
		}

		/*
		 * Get a page for the entry, using the existing swap
		 * cache page if there is one.  Otherwise, get a clean
		 * page and read the swap into it.
		 */
		swap_map = &si->swap_map[i];
		entry = swp_entry(type, i);
		page = read_swap_cache_async(entry,
					GFP_HIGHUSER_MOVABLE, NULL, 0);
		if (!page) {
			/*
			 * Either swap_duplicate() failed because entry
			 * has been freed independently, and will not be
			 * reused since sys_swapoff() already disabled
			 * allocation from here, or alloc_page() failed.
			 */
			if (!*swap_map)
				continue;
			retval = -ENOMEM;
			break;
		}

		/*
		 * Don't hold on to start_mm if it looks like exiting.
		 */
		if (atomic_read(&start_mm->mm_users) == 1) {
			mmput(start_mm);
			start_mm = &init_mm;
			atomic_inc(&init_mm.mm_users);
		}

		/*
		 * Wait for and lock page.  When do_swap_page races with
		 * try_to_unuse, do_swap_page can handle the fault much
		 * faster than try_to_unuse can locate the entry.  This
		 * apparently redundant "wait_on_page_locked" lets try_to_unuse
		 * defer to do_swap_page in such a case - in some tests,
		 * do_swap_page and try_to_unuse repeatedly compete.
		 */
		wait_on_page_locked(page);
		wait_on_page_writeback(page);
		lock_page(page);
		wait_on_page_writeback(page);

		/*
		 * Remove all references to entry.
		 */
		swcount = *swap_map;
		if (swap_count(swcount) == SWAP_MAP_SHMEM) {
			retval = shmem_unuse(entry, page);
			/* page has already been unlocked and released */
			if (retval < 0)
				break;
			continue;
		}
		/* Try the remembered mm first; init_mm itself is never scanned. */
		if (swap_count(swcount) && start_mm != &init_mm)
			retval = unuse_mm(start_mm, entry, page);

		if (swap_count(*swap_map)) {
			/*
			 * References remain: walk the rest of the mmlist,
			 * starting after start_mm, until the count drops to
			 * zero, an error occurs, or we are back at start_mm.
			 */
			int set_start_mm = (*swap_map >= swcount);
			struct list_head *p = &start_mm->mmlist;
			struct mm_struct *new_start_mm = start_mm;
			struct mm_struct *prev_mm = start_mm;
			struct mm_struct *mm;

			/* One extra hold each for new_start_mm and prev_mm. */
			atomic_inc(&new_start_mm->mm_users);
			atomic_inc(&prev_mm->mm_users);
			spin_lock(&mmlist_lock);
			while (swap_count(*swap_map) && !retval &&
					(p = p->next) != &start_mm->mmlist) {
				mm = list_entry(p, struct mm_struct, mmlist);
				/* Skip an mm whose user count has already hit zero. */
				if (!atomic_inc_not_zero(&mm->mm_users))
					continue;
				spin_unlock(&mmlist_lock);
				/* mm is now held, so the previous one can be let go. */
				mmput(prev_mm);
				prev_mm = mm;

				cond_resched();

				swcount = *swap_map;
				if (!swap_count(swcount)) /* any usage ? */
					;
				else if (mm == &init_mm)
					set_start_mm = 1;
				else
					retval = unuse_mm(mm, entry, page);

				/*
				 * The count went down while this mm was being
				 * handled: remember it as the next start_mm.
				 */
				if (set_start_mm && *swap_map < swcount) {
					mmput(new_start_mm);
					atomic_inc(&mm->mm_users);
					new_start_mm = mm;
					set_start_mm = 0;
				}
				spin_lock(&mmlist_lock);
			}
			spin_unlock(&mmlist_lock);
			mmput(prev_mm);
			mmput(start_mm);
			start_mm = new_start_mm;
		}
		if (retval) {
			unlock_page(page);
			page_cache_release(page);
			break;
		}

		/*
		 * If a reference remains (rare), we would like to leave
		 * the page in the swap cache; but try_to_unmap could
		 * then re-duplicate the entry once we drop page lock,
		 * so we might loop indefinitely; also, that page could
		 * not be swapped out to other storage meanwhile.  So:
		 * delete from cache even if there's another reference,
		 * after ensuring that the data has been saved to disk -
		 * since if the reference remains (rarer), it will be
		 * read from disk into another page.  Splitting into two
		 * pages would be incorrect if swap supported "shared
		 * private" pages, but they are handled by tmpfs files.
		 *
		 * Given how unuse_vma() targets one particular offset
		 * in an anon_vma, once the anon_vma has been determined,
		 * this splitting happens to be just what is needed to
		 * handle where KSM pages have been swapped out: re-reading
		 * is unnecessarily slow, but we can fix that later on.
		 */
		if (swap_count(*swap_map) &&
		     PageDirty(page) && PageSwapCache(page)) {
			struct writeback_control wbc = {
				.sync_mode = WB_SYNC_NONE,
			};

			/* The page lock is retaken after swap_writepage(). */
			swap_writepage(page, &wbc);
			lock_page(page);
			wait_on_page_writeback(page);
		}

		/*
		 * It is conceivable that a racing task removed this page from
		 * swap cache just before we acquired the page lock at the top,
		 * or while we dropped it in unuse_mm().  The page might even
		 * be back in swap cache on another swap area: that we must not
		 * delete, since it may not have been written out to swap yet.
		 */
		if (PageSwapCache(page) &&
		    likely(page_private(page) == entry.val))
			delete_from_swap_cache(page);

		/*
		 * So we could skip searching mms once swap count went
		 * to 1, we did not mark any present ptes as dirty: must
		 * mark page dirty so shrink_page_list will preserve it.
		 */
		SetPageDirty(page);
		unlock_page(page);
		page_cache_release(page);

		/*
		 * Make sure that we aren't completely killing
		 * interactive performance.
		 */
		cond_resched();
	}

	/* Drop the hold taken on whichever mm ended up as start_mm. */
	mmput(start_mm);
	return retval;
}
1298
1299/*
1300 * After a successful try_to_unuse, if no swap is now in use, we know
1301 * we can empty the mmlist.  swap_lock must be held on entry and exit.
1302 * Note that mmlist_lock nests inside swap_lock, and an mm must be
1303 * added to the mmlist just after page_duplicate - before would be racy.
1304 */
1305static void drain_mmlist(void)
1306{
1307	struct list_head *p, *next;
1308	unsigned int type;
1309
1310	for (type = 0; type < nr_swapfiles; type++)
1311		if (swap_info[type]->inuse_pages)
1312			return;
1313	spin_lock(&mmlist_lock);
1314	list_for_each_safe(p, next, &init_mm.mmlist)
1315		list_del_init(p);
1316	spin_unlock(&mmlist_lock);
1317}
1318
1319/*
1320 * Use this swapdev's extent info to locate the (PAGE_SIZE) block which
1321 * corresponds to page offset for the specified swap entry.
1322 * Note that the type of this function is sector_t, but it returns page offset
1323 * into the bdev, not sector offset.
1324 */
1325static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
1326{
1327	struct swap_info_struct *sis;
1328	struct swap_extent *start_se;
1329	struct swap_extent *se;
1330	pgoff_t offset;
1331
1332	sis = swap_info[swp_type(entry)];
1333	*bdev = sis->bdev;
1334
1335	offset = swp_offset(entry);
1336	start_se = sis->curr_swap_extent;
1337	se = start_se;
1338
1339	for ( ; ; ) {
1340		struct list_head *lh;
1341
1342		if (se->start_page <= offset &&
1343				offset < (se->start_page + se->nr_pages)) {
1344			return se->start_block + (offset - se->start_page);
1345		}
1346		lh = se->list.next;
1347		se = list_entry(lh, struct swap_extent, list);
1348		sis->curr_swap_extent = se;
1349		BUG_ON(se == start_se);		/* It *must* be present */
1350	}
1351}
1352
1353/*
1354 * Returns the page offset into bdev for the specified page's swap entry.
1355 */
1356sector_t map_swap_page(struct page *page, struct block_device **bdev)
1357{
1358	swp_entry_t entry;
1359	entry.val = page_private(page);
1360	return map_swap_entry(entry, bdev);
1361}
1362
1363/*
1364 * Free all of a swapdev's extent information
1365 */
1366static void destroy_swap_extents(struct swap_info_struct *sis)
1367{
1368	while (!list_empty(&sis->first_swap_extent.list)) {
1369		struct swap_extent *se;
1370
1371		se = list_entry(sis->first_swap_extent.list.next,
1372				struct swap_extent, list);
1373		list_del(&se->list);
1374		kfree(se);
1375	}
1376}
1377
1378/*
1379 * Add a block range (and the corresponding page range) into this swapdev's
1380 * extent list.  The extent list is kept sorted in page order.
1381 *
1382 * This function rather assumes that it is called in ascending page order.
1383 */
1384static int
1385add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
1386		unsigned long nr_pages, sector_t start_block)
1387{
1388	struct swap_extent *se;
1389	struct swap_extent *new_se;
1390	struct list_head *lh;
1391
1392	if (start_page == 0) {
1393		se = &sis->first_swap_extent;
1394		sis->curr_swap_extent = se;
1395		se->start_page = 0;
1396		se->nr_pages = nr_pages;
1397		se->start_block = start_block;
1398		return 1;
1399	} else {
1400		lh = sis->first_swap_extent.list.prev;	/* Highest extent */
1401		se = list_entry(lh, struct swap_extent, list);
1402		BUG_ON(se->start_page + se->nr_pages != start_page);
1403		if (se->start_block + se->nr_pages == start_block) {
1404			/* Merge it */
1405			se->nr_pages += nr_pages;
1406			return 0;
1407		}
1408	}
1409
1410	/*
1411	 * No merge.  Insert a new extent, preserving ordering.
1412	 */
1413	new_se = kmalloc(sizeof(*se), GFP_KERNEL);
1414	if (new_se == NULL)
1415		return -ENOMEM;
1416	new_se->start_page = start_page;
1417	new_se->nr_pages = nr_pages;
1418	new_se->start_block = start_block;
1419
1420	list_add_tail(&new_se->list, &sis->first_swap_extent.list);
1421	return 1;
1422}
1423
1424/*
1425 * A `swap extent' is a simple thing which maps a contiguous range of pages
1426 * onto a contiguous range of disk blocks.  An ordered list of swap extents
1427 * is built at swapon time and is then used at swap_writepage/swap_readpage
1428 * time for locating where on disk a page belongs.
1429 *
1430 * If the swapfile is an S_ISBLK block device, a single extent is installed.
1431 * This is done so that the main operating code can treat S_ISBLK and S_ISREG
1432 * swap files identically.
1433 *
1434 * Whether the swapdev is an S_ISREG file or an S_ISBLK blockdev, the swap
1435 * extent list operates in PAGE_SIZE disk blocks.  Both S_ISREG and S_ISBLK
1436 * swapfiles are handled *identically* after swapon time.
1437 *
1438 * For S_ISREG swapfiles, setup_swap_extents() will walk all the file's blocks
1439 * and will parse them into an ordered extent list, in PAGE_SIZE chunks.  If
1440 * some stray blocks are found which do not fall within the PAGE_SIZE alignment
1441 * requirements, they are simply tossed out - we will never use those blocks
1442 * for swapping.
1443 *
1444 * For S_ISREG swapfiles we set S_SWAPFILE across the life of the swapon.  This
1445 * prevents root from shooting her foot off by ftruncating an in-use swapfile,
1446 * which will scribble on the fs.
1447 *
1448 * The amount of disk space which a single swap extent represents varies.
1449 * Typically it is in the 1-4 megabyte range.  So we can have hundreds of
1450 * extents in the list.  To avoid much list walking, we cache the previous
1451 * search location in `curr_swap_extent', and start new searches from there.
1452 * This is extremely effective.  The average number of iterations in
1453 * map_swap_page() has been measured at about 0.3 per page.  - akpm.
1454 */
1455static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
1456{
1457	struct inode *inode;
1458	unsigned blocks_per_page;
1459	unsigned long page_no;
1460	unsigned blkbits;
1461	sector_t probe_block;
1462	sector_t last_block;
1463	sector_t lowest_block = -1;
1464	sector_t highest_block = 0;
1465	int nr_extents = 0;
1466	int ret;
1467
1468	inode = sis->swap_file->f_mapping->host;
1469	if (S_ISBLK(inode->i_mode)) {
1470		ret = add_swap_extent(sis, 0, sis->max, 0);
1471		*span = sis->pages;
1472		goto out;
1473	}
1474
1475	blkbits = inode->i_blkbits;
1476	blocks_per_page = PAGE_SIZE >> blkbits;
1477
1478	/*
1479	 * Map all the blocks into the extent list.  This code doesn't try
1480	 * to be very smart.
1481	 */
1482	probe_block = 0;
1483	page_no = 0;
1484	last_block = i_size_read(inode) >> blkbits;
1485	while ((probe_block + blocks_per_page) <= last_block &&
1486			page_no < sis->max) {
1487		unsigned block_in_page;
1488		sector_t first_block;
1489
1490		first_block = bmap(inode, probe_block);
1491		if (first_block == 0)
1492			goto bad_bmap;
1493
1494		/*
1495		 * It must be PAGE_SIZE aligned on-disk
1496		 */
1497		if (first_block & (blocks_per_page - 1)) {
1498			probe_block++;
1499			goto reprobe;
1500		}
1501
1502		for (block_in_page = 1; block_in_page < blocks_per_page;
1503					block_in_page++) {
1504			sector_t block;
1505
1506			block = bmap(inode, probe_block + block_in_page);
1507			if (block == 0)
1508				goto bad_bmap;
1509			if (block != first_block + block_in_page) {
1510				/* Discontiguity */
1511				probe_block++;
1512				goto reprobe;
1513			}
1514		}
1515
1516		first_block >>= (PAGE_SHIFT - blkbits);
1517		if (page_no) {	/* exclude the header page */
1518			if (first_block < lowest_block)
1519				lowest_block = first_block;
1520			if (first_block > highest_block)
1521				highest_block = first_block;
1522		}
1523
1524		/*
1525		 * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks
1526		 */
1527		ret = add_swap_extent(sis, page_no, 1, first_block);
1528		if (ret < 0)
1529			goto out;
1530		nr_extents += ret;
1531		page_no++;
1532		probe_block += blocks_per_page;
1533reprobe:
1534		continue;
1535	}
1536	ret = nr_extents;
1537	*span = 1 + highest_block - lowest_block;
1538	if (page_no == 0)
1539		page_no = 1;	/* force Empty message */
1540	sis->max = page_no;
1541	sis->pages = page_no - 1;
1542	sis->highest_bit = page_no - 1;
1543out:
1544	return ret;
1545bad_bmap:
1546	printk(KERN_ERR "swapon: swapfile has holes\n");
1547	ret = -EINVAL;
1548	goto out;
1549}
1550
/*
 * The swapoff system call: stop swapping to @specialfile and bring
 * everything stored there back into memory.
 *
 * Outline:
 *  - locate the active (SWP_WRITEOK) swap area whose file shares the
 *    mapping of @specialfile;
 *  - take it off swap_list and clear SWP_WRITEOK;
 *  - empty it with try_to_unuse(); if that fails the area is put back on
 *    swap_list and made writable again;
 *  - otherwise tear down its extents, swap map and cgroup accounting,
 *    restore the block size / S_SWAPFILE state and close the file.
 *
 * Returns 0 on success, -EPERM without CAP_SYS_ADMIN, the error from
 * getname()/filp_open(), -EINVAL if no active swap area matches,
 * -ENOMEM if security_vm_enough_memory() refuses, or the error from
 * try_to_unuse().
 */
SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
{
	struct swap_info_struct *p = NULL;
	unsigned char *swap_map;
	struct file *swap_file, *victim;
	struct address_space *mapping;
	struct inode *inode;
	char *pathname;
	int i, type, prev;
	int err;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	pathname = getname(specialfile);
	err = PTR_ERR(pathname);
	if (IS_ERR(pathname))
		goto out;

	/* Open the path only to learn which mapping it refers to. */
	victim = filp_open(pathname, O_RDWR|O_LARGEFILE, 0);
	putname(pathname);
	err = PTR_ERR(victim);
	if (IS_ERR(victim))
		goto out;

	mapping = victim->f_mapping;
	prev = -1;
	spin_lock(&swap_lock);
	/* Walk swap_list for the matching area, tracking its predecessor. */
	for (type = swap_list.head; type >= 0; type = swap_info[type]->next) {
		p = swap_info[type];
		if (p->flags & SWP_WRITEOK) {
			if (p->swap_file->f_mapping == mapping)
				break;
		}
		prev = type;
	}
	if (type < 0) {
		err = -EINVAL;
		spin_unlock(&swap_lock);
		goto out_dput;
	}
	if (!security_vm_enough_memory(p->pages))
		vm_unacct_memory(p->pages);
	else {
		err = -ENOMEM;
		spin_unlock(&swap_lock);
		goto out_dput;
	}
	/* Unlink the area from swap_list. */
	if (prev < 0)
		swap_list.head = p->next;
	else
		swap_info[prev]->next = p->next;
	if (type == swap_list.next) {
		/* just pick something that's safe... */
		swap_list.next = swap_list.head;
	}
	if (p->prio < 0) {
		/* Renumber the negative priorities of the areas after this one. */
		for (i = p->next; i >= 0; i = swap_info[i]->next)
			swap_info[i]->prio = p->prio--;
		least_priority++;
	}
	nr_swap_pages -= p->pages;
	total_swap_pages -= p->pages;
	p->flags &= ~SWP_WRITEOK;
	spin_unlock(&swap_lock);

	/* PF_OOM_ORIGIN is set on the caller only while the area is emptied. */
	current->flags |= PF_OOM_ORIGIN;
	err = try_to_unuse(type);
	current->flags &= ~PF_OOM_ORIGIN;

	if (err) {
		/* re-insert swap space back into swap_list */
		spin_lock(&swap_lock);
		if (p->prio < 0)
			p->prio = --least_priority;
		prev = -1;
		for (i = swap_list.head; i >= 0; i = swap_info[i]->next) {
			if (p->prio >= swap_info[i]->prio)
				break;
			prev = i;
		}
		p->next = i;
		if (prev < 0)
			swap_list.head = swap_list.next = type;
		else
			swap_info[prev]->next = type;
		nr_swap_pages += p->pages;
		total_swap_pages += p->pages;
		p->flags |= SWP_WRITEOK;
		spin_unlock(&swap_lock);
		goto out_dput;
	}

	/* wait for any unplug function to finish */
	down_write(&swap_unplug_sem);
	up_write(&swap_unplug_sem);

	destroy_swap_extents(p);
	if (p->flags & SWP_CONTINUED)
		free_swap_count_continuations(p);

	mutex_lock(&swapon_mutex);
	spin_lock(&swap_lock);
	drain_mmlist();

	/* wait for anyone still in scan_swap_map */
	p->highest_bit = 0;		/* cuts scans short */
	while (p->flags >= SWP_SCANNING) {
		spin_unlock(&swap_lock);
		schedule_timeout_uninterruptible(1);
		spin_lock(&swap_lock);
	}

	/* Detach file and map under the locks; free them after unlocking. */
	swap_file = p->swap_file;
	p->swap_file = NULL;
	p->max = 0;
	swap_map = p->swap_map;
	p->swap_map = NULL;
	p->flags = 0;
	spin_unlock(&swap_lock);
	mutex_unlock(&swapon_mutex);
	vfree(swap_map);
	/* Destroy swap account information */
	swap_cgroup_swapoff(type);

	inode = mapping->host;
	if (S_ISBLK(inode->i_mode)) {
		/* Undo swapon: restore the block size, drop the exclusive open. */
		struct block_device *bdev = I_BDEV(inode);
		set_blocksize(bdev, p->old_block_size);
		blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
	} else {
		mutex_lock(&inode->i_mutex);
		inode->i_flags &= ~S_SWAPFILE;
		mutex_unlock(&inode->i_mutex);
	}
	filp_close(swap_file, NULL);
	err = 0;
	/* Let pollers of /proc/swaps know that the set of areas changed. */
	atomic_inc(&proc_poll_event);
	wake_up_interruptible(&proc_poll_wait);

out_dput:
	filp_close(victim, NULL);
out:
	return err;
}
1696
1697#ifdef CONFIG_PROC_FS
/*
 * Per-open state of /proc/swaps.  swaps_open() allocates one of these
 * and stores it in file->private_data before calling seq_open().
 */
struct proc_swaps {
	struct seq_file seq;	/* seq_file state for this open file */
	int event;		/* proc_poll_event value last seen by this file */
};
1702
1703static unsigned swaps_poll(struct file *file, poll_table *wait)
1704{
1705	struct proc_swaps *s = file->private_data;
1706
1707	poll_wait(file, &proc_poll_wait, wait);
1708
1709	if (s->event != atomic_read(&proc_poll_event)) {
1710		s->event = atomic_read(&proc_poll_event);
1711		return POLLIN | POLLRDNORM | POLLERR | POLLPRI;
1712	}
1713
1714	return POLLIN | POLLRDNORM;
1715}
1716
1717/* iterator */
1718static void *swap_start(struct seq_file *swap, loff_t *pos)
1719{
1720	struct swap_info_struct *si;
1721	int type;
1722	loff_t l = *pos;
1723
1724	mutex_lock(&swapon_mutex);
1725
1726	if (!l)
1727		return SEQ_START_TOKEN;
1728
1729	for (type = 0; type < nr_swapfiles; type++) {
1730		smp_rmb();	/* read nr_swapfiles before swap_info[type] */
1731		si = swap_info[type];
1732		if (!(si->flags & SWP_USED) || !si->swap_map)
1733			continue;
1734		if (!--l)
1735			return si;
1736	}
1737
1738	return NULL;
1739}
1740
1741static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
1742{
1743	struct swap_info_struct *si = v;
1744	int type;
1745
1746	if (v == SEQ_START_TOKEN)
1747		type = 0;
1748	else
1749		type = si->type + 1;
1750
1751	for (; type < nr_swapfiles; type++) {
1752		smp_rmb();	/* read nr_swapfiles before swap_info[type] */
1753		si = swap_info[type];
1754		if (!(si->flags & SWP_USED) || !si->swap_map)
1755			continue;
1756		++*pos;
1757		return si;
1758	}
1759
1760	return NULL;
1761}
1762
/*
 * seq_file iterator: stop.  Releases swapon_mutex, which swap_start()
 * acquired.  Neither argument is used.
 */
static void swap_stop(struct seq_file *swap, void *v)
{
	mutex_unlock(&swapon_mutex);
}
1767
1768static int swap_show(struct seq_file *swap, void *v)
1769{
1770	struct swap_info_struct *si = v;
1771	struct file *file;
1772	int len;
1773
1774	if (si == SEQ_START_TOKEN) {
1775		seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
1776		return 0;
1777	}
1778
1779	file = si->swap_file;
1780	len = seq_path(swap, &file->f_path, " \t\n\\");
1781	seq_printf(swap, "%*s%s\t%u\t%u\t%d\n",
1782			len < 40 ? 40 - len : 1, " ",
1783			S_ISBLK(file->f_path.dentry->d_inode->i_mode) ?
1784				"partition" : "file\t",
1785			si->pages << (PAGE_SHIFT - 10),
1786			si->inuse_pages << (PAGE_SHIFT - 10),
1787			si->prio);
1788	return 0;
1789}
1790
/*
 * seq_file callbacks behind /proc/swaps; handed to seq_open() by
 * swaps_open().
 */
static const struct seq_operations swaps_op = {
	.start =	swap_start,	/* takes swapon_mutex */
	.next =		swap_next,
	.stop =		swap_stop,	/* releases swapon_mutex */
	.show =		swap_show
};
1797
1798static int swaps_open(struct inode *inode, struct file *file)
1799{
1800	struct proc_swaps *s;
1801	int ret;
1802
1803	s = kmalloc(sizeof(struct proc_swaps), GFP_KERNEL);
1804	if (!s)
1805		return -ENOMEM;
1806
1807	file->private_data = s;
1808
1809	ret = seq_open(file, &swaps_op);
1810	if (ret) {
1811		kfree(s);
1812		return ret;
1813	}
1814
1815	s->seq.private = s;
1816	s->event = atomic_read(&proc_poll_event);
1817	return ret;
1818}
1819
/*
 * File operations of /proc/swaps: seq_file reading, plus poll() so that
 * user space can wait for swapon/swapoff activity.
 */
static const struct file_operations proc_swaps_operations = {
	.open		= swaps_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
	.poll		= swaps_poll,
};
1827
/*
 * Register /proc/swaps at init time.  The result of proc_create() is
 * not checked; 0 is always returned.
 */
static int __init procswaps_init(void)
{
	proc_create("swaps", 0, NULL, &proc_swaps_operations);
	return 0;
}
__initcall(procswaps_init);
1834#endif /* CONFIG_PROC_FS */
1835
1836#ifdef MAX_SWAPFILES_CHECK
/*
 * Runs the MAX_SWAPFILES_CHECK() macro once, as a late initcall.  Only
 * built when that macro is defined.
 * NOTE(review): the macro is defined outside this file; presumably it
 * validates MAX_SWAPFILES against the swap entry encoding - confirm.
 */
static int __init max_swapfiles_check(void)
{
	MAX_SWAPFILES_CHECK();
	return 0;
}
late_initcall(max_swapfiles_check);
1843#endif
1844
1845/*
1846 * Written 01/25/92 by Simmule Turner, heavily changed by Linus.
1847 *
1848 * The swapon system call
1849 */
1850SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
1851{
1852	struct swap_info_struct *p;
1853	char *name = NULL;
1854	struct block_device *bdev = NULL;
1855	struct file *swap_file = NULL;
1856	struct address_space *mapping;
1857	unsigned int type;
1858	int i, prev;
1859	int error;
1860	union swap_header *swap_header;
1861	unsigned int nr_good_pages;
1862	int nr_extents = 0;
1863	sector_t span;
1864	unsigned long maxpages;
1865	unsigned long swapfilepages;
1866	unsigned char *swap_map = NULL;
1867	struct page *page = NULL;
1868	struct inode *inode = NULL;
1869	int did_down = 0;
1870
1871	if (!capable(CAP_SYS_ADMIN))
1872		return -EPERM;
1873
1874	p = kzalloc(sizeof(*p), GFP_KERNEL);
1875	if (!p)
1876		return -ENOMEM;
1877
1878	spin_lock(&swap_lock);
1879	for (type = 0; type < nr_swapfiles; type++) {
1880		if (!(swap_info[type]->flags & SWP_USED))
1881			break;
1882	}
1883	error = -EPERM;
1884	if (type >= MAX_SWAPFILES) {
1885		spin_unlock(&swap_lock);
1886		kfree(p);
1887		goto out;
1888	}
1889	if (type >= nr_swapfiles) {
1890		p->type = type;
1891		swap_info[type] = p;
1892		/*
1893		 * Write swap_info[type] before nr_swapfiles, in case a
1894		 * racing procfs swap_start() or swap_next() is reading them.
1895		 * (We never shrink nr_swapfiles, we never free this entry.)
1896		 */
1897		smp_wmb();
1898		nr_swapfiles++;
1899	} else {
1900		kfree(p);
1901		p = swap_info[type];
1902		/*
1903		 * Do not memset this entry: a racing procfs swap_next()
1904		 * would be relying on p->type to remain valid.
1905		 */
1906	}
1907	INIT_LIST_HEAD(&p->first_swap_extent.list);
1908	p->flags = SWP_USED;
1909	p->next = -1;
1910	spin_unlock(&swap_lock);
1911
1912	name = getname(specialfile);
1913	error = PTR_ERR(name);
1914	if (IS_ERR(name)) {
1915		name = NULL;
1916		goto bad_swap_2;
1917	}
1918	swap_file = filp_open(name, O_RDWR|O_LARGEFILE, 0);
1919	error = PTR_ERR(swap_file);
1920	if (IS_ERR(swap_file)) {
1921		swap_file = NULL;
1922		goto bad_swap_2;
1923	}
1924
1925	p->swap_file = swap_file;
1926	mapping = swap_file->f_mapping;
1927	inode = mapping->host;
1928
1929	error = -EBUSY;
1930	for (i = 0; i < nr_swapfiles; i++) {
1931		struct swap_info_struct *q = swap_info[i];
1932
1933		if (i == type || !q->swap_file)
1934			continue;
1935		if (mapping == q->swap_file->f_mapping)
1936			goto bad_swap;
1937	}
1938
1939	error = -EINVAL;
1940	if (S_ISBLK(inode->i_mode)) {
1941		bdev = I_BDEV(inode);
1942		error = blkdev_get(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL,
1943				   sys_swapon);
1944		if (error < 0) {
1945			bdev = NULL;
1946			error = -EINVAL;
1947			goto bad_swap;
1948		}
1949		p->old_block_size = block_size(bdev);
1950		error = set_blocksize(bdev, PAGE_SIZE);
1951		if (error < 0)
1952			goto bad_swap;
1953		p->bdev = bdev;
1954		p->flags |= SWP_BLKDEV;
1955	} else if (S_ISREG(inode->i_mode)) {
1956		p->bdev = inode->i_sb->s_bdev;
1957		mutex_lock(&inode->i_mutex);
1958		did_down = 1;
1959		if (IS_SWAPFILE(inode)) {
1960			error = -EBUSY;
1961			goto bad_swap;
1962		}
1963	} else {
1964		goto bad_swap;
1965	}
1966
1967	swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
1968
1969	/*
1970	 * Read the swap header.
1971	 */
1972	if (!mapping->a_ops->readpage) {
1973		error = -EINVAL;
1974		goto bad_swap;
1975	}
1976	page = read_mapping_page(mapping, 0, swap_file);
1977	if (IS_ERR(page)) {
1978		error = PTR_ERR(page);
1979		goto bad_swap;
1980	}
1981	swap_header = kmap(page);
1982
1983	if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
1984		printk(KERN_ERR "Unable to find swap-space signature\n");
1985		error = -EINVAL;
1986		goto bad_swap;
1987	}
1988
1989	/* swap partition endianess hack... */
1990	if (swab32(swap_header->info.version) == 1) {
1991		swab32s(&swap_header->info.version);
1992		swab32s(&swap_header->info.last_page);
1993		swab32s(&swap_header->info.nr_badpages);
1994		for (i = 0; i < swap_header->info.nr_badpages; i++)
1995			swab32s(&swap_header->info.badpages[i]);
1996	}
1997	/* Check the swap header's sub-version */
1998	if (swap_header->info.version != 1) {
1999		printk(KERN_WARNING
2000		       "Unable to handle swap header version %d\n",
2001		       swap_header->info.version);
2002		error = -EINVAL;
2003		goto bad_swap;
2004	}
2005
2006	p->lowest_bit  = 1;
2007	p->cluster_next = 1;
2008	p->cluster_nr = 0;
2009
2010	/*
2011	 * Find out how many pages are allowed for a single swap
2012	 * device. There are two limiting factors: 1) the number of
2013	 * bits for the swap offset in the swp_entry_t type and
2014	 * 2) the number of bits in the a swap pte as defined by
2015	 * the different architectures. In order to find the
2016	 * largest possible bit mask a swap entry with swap type 0
2017	 * and swap offset ~0UL is created, encoded to a swap pte,
2018	 * decoded to a swp_entry_t again and finally the swap
2019	 * offset is extracted. This will mask all the bits from
2020	 * the initial ~0UL mask that can't be encoded in either
2021	 * the swp_entry_t or the architecture definition of a
2022	 * swap pte.
2023	 */
2024	maxpages = swp_offset(pte_to_swp_entry(
2025			swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
2026	if (maxpages > swap_header->info.last_page) {
2027		maxpages = swap_header->info.last_page + 1;
2028		/* p->max is an unsigned int: don't overflow it */
2029		if ((unsigned int)maxpages == 0)
2030			maxpages = UINT_MAX;
2031	}
2032	p->highest_bit = maxpages - 1;
2033
2034	error = -EINVAL;
2035	if (!maxpages)
2036		goto bad_swap;
2037	if (swapfilepages && maxpages > swapfilepages) {
2038		printk(KERN_WARNING
2039		       "Swap area shorter than signature indicates\n");
2040		goto bad_swap;
2041	}
2042	if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
2043		goto bad_swap;
2044	if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
2045		goto bad_swap;
2046
2047	/* OK, set up the swap map and apply the bad block list */
2048	swap_map = vmalloc(maxpages);
2049	if (!swap_map) {
2050		error = -ENOMEM;
2051		goto bad_swap;
2052	}
2053
2054	memset(swap_map, 0, maxpages);
2055	nr_good_pages = maxpages - 1;	/* omit header page */
2056
2057	for (i = 0; i < swap_header->info.nr_badpages; i++) {
2058		unsigned int page_nr = swap_header->info.badpages[i];
2059		if (page_nr == 0 || page_nr > swap_header->info.last_page) {
2060			error = -EINVAL;
2061			goto bad_swap;
2062		}
2063		if (page_nr < maxpages) {
2064			swap_map[page_nr] = SWAP_MAP_BAD;
2065			nr_good_pages--;
2066		}
2067	}
2068
2069	error = swap_cgroup_swapon(type, maxpages);
2070	if (error)
2071		goto bad_swap;
2072
2073	if (nr_good_pages) {
2074		swap_map[0] = SWAP_MAP_BAD;
2075		p->max = maxpages;
2076		p->pages = nr_good_pages;
2077		nr_extents = setup_swap_extents(p, &span);
2078		if (nr_extents < 0) {
2079			error = nr_extents;
2080			goto bad_swap;
2081		}
2082		nr_good_pages = p->pages;
2083	}
2084	if (!nr_good_pages) {
2085		printk(KERN_WARNING "Empty swap-file\n");
2086		error = -EINVAL;
2087		goto bad_swap;
2088	}
2089
2090	if (p->bdev) {
2091		if (blk_queue_nonrot(bdev_get_queue(p->bdev))) {
2092			p->flags |= SWP_SOLIDSTATE;
2093			p->cluster_next = 1 + (random32() % p->highest_bit);
2094		}
2095		if (discard_swap(p) == 0 && (swap_flags & SWAP_FLAG_DISCARD))
2096			p->flags |= SWP_DISCARDABLE;
2097	}
2098
2099	mutex_lock(&swapon_mutex);
2100	spin_lock(&swap_lock);
2101	if (swap_flags & SWAP_FLAG_PREFER)
2102		p->prio =
2103		  (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
2104	else
2105		p->prio = --least_priority;
2106	p->swap_map = swap_map;
2107	p->flags |= SWP_WRITEOK;
2108	nr_swap_pages += nr_good_pages;
2109	total_swap_pages += nr_good_pages;
2110
2111	printk(KERN_INFO "Adding %uk swap on %s.  "
2112			"Priority:%d extents:%d across:%lluk %s%s\n",
2113		nr_good_pages<<(PAGE_SHIFT-10), name, p->prio,
2114		nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
2115		(p->flags & SWP_SOLIDSTATE) ? "SS" : "",
2116		(p->flags & SWP_DISCARDABLE) ? "D" : "");
2117
2118	/* insert swap space into swap_list: */
2119	prev = -1;
2120	for (i = swap_list.head; i >= 0; i = swap_info[i]->next) {
2121		if (p->prio >= swap_info[i]->prio)
2122			break;
2123		prev = i;
2124	}
2125	p->next = i;
2126	if (prev < 0)
2127		swap_list.head = swap_list.next = type;
2128	else
2129		swap_info[prev]->next = type;
2130	spin_unlock(&swap_lock);
2131	mutex_unlock(&swapon_mutex);
2132	atomic_inc(&proc_poll_event);
2133	wake_up_interruptible(&proc_poll_wait);
2134
2135	error = 0;
2136	goto out;
2137bad_swap:
2138	if (bdev) {
2139		set_blocksize(bdev, p->old_block_size);
2140		blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
2141	}
2142	destroy_swap_extents(p);
2143	swap_cgroup_swapoff(type);
2144bad_swap_2:
2145	spin_lock(&swap_lock);
2146	p->swap_file = NULL;
2147	p->flags = 0;
2148	spin_unlock(&swap_lock);
2149	vfree(swap_map);
2150	if (swap_file)
2151		filp_close(swap_file, NULL);
2152out:
2153	if (page && !IS_ERR(page)) {
2154		kunmap(page);
2155		page_cache_release(page);
2156	}
2157	if (name)
2158		putname(name);
2159	if (did_down) {
2160		if (!error)
2161			inode->i_flags |= S_SWAPFILE;
2162		mutex_unlock(&inode->i_mutex);
2163	}
2164	return error;
2165}
2166
2167void si_swapinfo(struct sysinfo *val)
2168{
2169	unsigned int type;
2170	unsigned long nr_to_be_unused = 0;
2171
2172	spin_lock(&swap_lock);
2173	for (type = 0; type < nr_swapfiles; type++) {
2174		struct swap_info_struct *si = swap_info[type];
2175
2176		if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
2177			nr_to_be_unused += si->inuse_pages;
2178	}
2179	val->freeswap = nr_swap_pages + nr_to_be_unused;
2180	val->totalswap = total_swap_pages + nr_to_be_unused;
2181	spin_unlock(&swap_lock);
2182}
2183
2184/*
2185 * Verify that a swap entry is valid and increment its swap map count.
2186 *
2187 * Returns error code in following case.
2188 * - success -> 0
2189 * - swp_entry is invalid -> EINVAL
2190 * - swp_entry is migration entry -> EINVAL
2191 * - swap-cache reference is requested but there is already one. -> EEXIST
2192 * - swap-cache reference is requested but the entry is not used. -> ENOENT
2193 * - swap-mapped reference requested but needs continued swap count. -> ENOMEM
2194 */
2195static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
2196{
2197	struct swap_info_struct *p;
2198	unsigned long offset, type;
2199	unsigned char count;
2200	unsigned char has_cache;
2201	int err = -EINVAL;
2202
2203	if (non_swap_entry(entry))
2204		goto out;
2205
2206	type = swp_type(entry);
2207	if (type >= nr_swapfiles)
2208		goto bad_file;
2209	p = swap_info[type];
2210	offset = swp_offset(entry);
2211
2212	spin_lock(&swap_lock);
2213	if (unlikely(offset >= p->max))
2214		goto unlock_out;
2215
2216	count = p->swap_map[offset];
2217	has_cache = count & SWAP_HAS_CACHE;
2218	count &= ~SWAP_HAS_CACHE;
2219	err = 0;
2220
2221	if (usage == SWAP_HAS_CACHE) {
2222
2223		/* set SWAP_HAS_CACHE if there is no cache and entry is used */
2224		if (!has_cache && count)
2225			has_cache = SWAP_HAS_CACHE;
2226		else if (has_cache)		/* someone else added cache */
2227			err = -EEXIST;
2228		else				/* no users remaining */
2229			err = -ENOENT;
2230
2231	} else if (count || has_cache) {
2232
2233		if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
2234			count += usage;
2235		else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX)
2236			err = -EINVAL;
2237		else if (swap_count_continued(p, offset, count))
2238			count = COUNT_CONTINUED;
2239		else
2240			err = -ENOMEM;
2241	} else
2242		err = -ENOENT;			/* unused swap entry */
2243
2244	p->swap_map[offset] = count | has_cache;
2245
2246unlock_out:
2247	spin_unlock(&swap_lock);
2248out:
2249	return err;
2250
2251bad_file:
2252	printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val);
2253	goto out;
2254}
2255
2256/*
2257 * Help swapoff by noting that swap entry belongs to shmem/tmpfs
2258 * (in which case its reference count is never incremented).
2259 */
2260void swap_shmem_alloc(swp_entry_t entry)
2261{
2262	__swap_duplicate(entry, SWAP_MAP_SHMEM);
2263}
2264
2265/*
2266 * Increase reference count of swap entry by 1.
2267 * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required
2268 * but could not be atomically allocated.  Returns 0, just as if it succeeded,
2269 * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which
2270 * might occur if a page table entry has got corrupted.
2271 */
2272int swap_duplicate(swp_entry_t entry)
2273{
2274	int err = 0;
2275
2276	while (!err && __swap_duplicate(entry, 1) == -ENOMEM)
2277		err = add_swap_count_continuation(entry, GFP_ATOMIC);
2278	return err;
2279}
2280
2281/*
2282 * @entry: swap entry for which we allocate swap cache.
2283 *
2284 * Called when allocating swap cache for existing swap entry,
2285 * This can return error codes. Returns 0 at success.
2286 * -EBUSY means there is a swap cache.
2287 * Note: return code is different from swap_duplicate().
2288 */
2289int swapcache_prepare(swp_entry_t entry)
2290{
2291	return __swap_duplicate(entry, SWAP_HAS_CACHE);
2292}
2293
2294/*
2295 * swap_lock prevents swap_map being freed. Don't grab an extra
2296 * reference on the swaphandle, it doesn't matter if it becomes unused.
2297 */
2298int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
2299{
2300	struct swap_info_struct *si;
2301	int our_page_cluster = page_cluster;
2302	pgoff_t target, toff;
2303	pgoff_t base, end;
2304	int nr_pages = 0;
2305
2306	if (!our_page_cluster)	/* no readahead */
2307		return 0;
2308
2309	si = swap_info[swp_type(entry)];
2310	target = swp_offset(entry);
2311	base = (target >> our_page_cluster) << our_page_cluster;
2312	end = base + (1 << our_page_cluster);
2313	if (!base)		/* first page is swap header */
2314		base++;
2315
2316	spin_lock(&swap_lock);
2317	if (end > si->max)	/* don't go beyond end of map */
2318		end = si->max;
2319
2320	/* Count contiguous allocated slots above our target */
2321	for (toff = target; ++toff < end; nr_pages++) {
2322		/* Don't read in free or bad pages */
2323		if (!si->swap_map[toff])
2324			break;
2325		if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
2326			break;
2327	}
2328	/* Count contiguous allocated slots below our target */
2329	for (toff = target; --toff >= base; nr_pages++) {
2330		/* Don't read in free or bad pages */
2331		if (!si->swap_map[toff])
2332			break;
2333		if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
2334			break;
2335	}
2336	spin_unlock(&swap_lock);
2337
2338	/*
2339	 * Indicate starting offset, and return number of pages to get:
2340	 * if only 1, say 0, since there's then no readahead to be done.
2341	 */
2342	*offset = ++toff;
2343	return nr_pages? ++nr_pages: 0;
2344}
2345
2346/*
2347 * add_swap_count_continuation - called when a swap count is duplicated
2348 * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's
2349 * page of the original vmalloc'ed swap_map, to hold the continuation count
2350 * (for that entry and for its neighbouring PAGE_SIZE swap entries).  Called
2351 * again when count is duplicated beyond SWAP_MAP_MAX * SWAP_CONT_MAX, etc.
2352 *
2353 * These continuation pages are seldom referenced: the common paths all work
2354 * on the original swap_map, only referring to a continuation page when the
2355 * low "digit" of a count is incremented or decremented through SWAP_MAP_MAX.
2356 *
2357 * add_swap_count_continuation(, GFP_ATOMIC) can be called while holding
2358 * page table locks; if it fails, add_swap_count_continuation(, GFP_KERNEL)
2359 * can be called after dropping locks.
2360 */
2361int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
2362{
2363	struct swap_info_struct *si;
2364	struct page *head;
2365	struct page *page;
2366	struct page *list_page;
2367	pgoff_t offset;
2368	unsigned char count;
2369
2370	/*
2371	 * When debugging, it's easier to use __GFP_ZERO here; but it's better
2372	 * for latency not to zero a page while GFP_ATOMIC and holding locks.
2373	 */
2374	page = alloc_page(gfp_mask | __GFP_HIGHMEM);
2375
2376	si = swap_info_get(entry);
2377	if (!si) {
2378		/*
2379		 * An acceptable race has occurred since the failing
2380		 * __swap_duplicate(): the swap entry has been freed,
2381		 * perhaps even the whole swap_map cleared for swapoff.
2382		 */
2383		goto outer;
2384	}
2385
2386	offset = swp_offset(entry);
2387	count = si->swap_map[offset] & ~SWAP_HAS_CACHE;
2388
2389	if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) {
2390		/*
2391		 * The higher the swap count, the more likely it is that tasks
2392		 * will race to add swap count continuation: we need to avoid
2393		 * over-provisioning.
2394		 */
2395		goto out;
2396	}
2397
2398	if (!page) {
2399		spin_unlock(&swap_lock);
2400		return -ENOMEM;
2401	}
2402
2403	/*
2404	 * We are fortunate that although vmalloc_to_page uses pte_offset_map,
2405	 * no architecture is using highmem pages for kernel pagetables: so it
2406	 * will not corrupt the GFP_ATOMIC caller's atomic pagetable kmaps.
2407	 */
2408	head = vmalloc_to_page(si->swap_map + offset);
2409	offset &= ~PAGE_MASK;
2410
2411	/*
2412	 * Page allocation does not initialize the page's lru field,
2413	 * but it does always reset its private field.
2414	 */
2415	if (!page_private(head)) {
2416		BUG_ON(count & COUNT_CONTINUED);
2417		INIT_LIST_HEAD(&head->lru);
2418		set_page_private(head, SWP_CONTINUED);
2419		si->flags |= SWP_CONTINUED;
2420	}
2421
2422	list_for_each_entry(list_page, &head->lru, lru) {
2423		unsigned char *map;
2424
2425		/*
2426		 * If the previous map said no continuation, but we've found
2427		 * a continuation page, free our allocation and use this one.
2428		 */
2429		if (!(count & COUNT_CONTINUED))
2430			goto out;
2431
2432		map = kmap_atomic(list_page, KM_USER0) + offset;
2433		count = *map;
2434		kunmap_atomic(map, KM_USER0);
2435
2436		/*
2437		 * If this continuation count now has some space in it,
2438		 * free our allocation and use this one.
2439		 */
2440		if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX)
2441			goto out;
2442	}
2443
2444	list_add_tail(&page->lru, &head->lru);
2445	page = NULL;			/* now it's attached, don't free it */
2446out:
2447	spin_unlock(&swap_lock);
2448outer:
2449	if (page)
2450		__free_page(page);
2451	return 0;
2452}
2453
2454/*
2455 * swap_count_continued - when the original swap_map count is incremented
2456 * from SWAP_MAP_MAX, check if there is already a continuation page to carry
2457 * into, carry if so, or else fail until a new continuation page is allocated;
2458 * when the original swap_map count is decremented from 0 with continuation,
2459 * borrow from the continuation and report whether it still holds more.
2460 * Called while __swap_duplicate() or swap_entry_free() holds swap_lock.
2461 */
2462static bool swap_count_continued(struct swap_info_struct *si,
2463				 pgoff_t offset, unsigned char count)
2464{
2465	struct page *head;
2466	struct page *page;
2467	unsigned char *map;
2468
2469	head = vmalloc_to_page(si->swap_map + offset);
2470	if (page_private(head) != SWP_CONTINUED) {
2471		BUG_ON(count & COUNT_CONTINUED);
2472		return false;		/* need to add count continuation */
2473	}
2474
2475	offset &= ~PAGE_MASK;
2476	page = list_entry(head->lru.next, struct page, lru);
2477	map = kmap_atomic(page, KM_USER0) + offset;
2478
2479	if (count == SWAP_MAP_MAX)	/* initial increment from swap_map */
2480		goto init_map;		/* jump over SWAP_CONT_MAX checks */
2481
2482	if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */
2483		/*
2484		 * Think of how you add 1 to 999
2485		 */
2486		while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) {
2487			kunmap_atomic(map, KM_USER0);
2488			page = list_entry(page->lru.next, struct page, lru);
2489			BUG_ON(page == head);
2490			map = kmap_atomic(page, KM_USER0) + offset;
2491		}
2492		if (*map == SWAP_CONT_MAX) {
2493			kunmap_atomic(map, KM_USER0);
2494			page = list_entry(page->lru.next, struct page, lru);
2495			if (page == head)
2496				return false;	/* add count continuation */
2497			map = kmap_atomic(page, KM_USER0) + offset;
2498init_map:		*map = 0;		/* we didn't zero the page */
2499		}
2500		*map += 1;
2501		kunmap_atomic(map, KM_USER0);
2502		page = list_entry(page->lru.prev, struct page, lru);
2503		while (page != head) {
2504			map = kmap_atomic(page, KM_USER0) + offset;
2505			*map = COUNT_CONTINUED;
2506			kunmap_atomic(map, KM_USER0);
2507			page = list_entry(page->lru.prev, struct page, lru);
2508		}
2509		return true;			/* incremented */
2510
2511	} else {				/* decrementing */
2512		/*
2513		 * Think of how you subtract 1 from 1000
2514		 */
2515		BUG_ON(count != COUNT_CONTINUED);
2516		while (*map == COUNT_CONTINUED) {
2517			kunmap_atomic(map, KM_USER0);
2518			page = list_entry(page->lru.next, struct page, lru);
2519			BUG_ON(page == head);
2520			map = kmap_atomic(page, KM_USER0) + offset;
2521		}
2522		BUG_ON(*map == 0);
2523		*map -= 1;
2524		if (*map == 0)
2525			count = 0;
2526		kunmap_atomic(map, KM_USER0);
2527		page = list_entry(page->lru.prev, struct page, lru);
2528		while (page != head) {
2529			map = kmap_atomic(page, KM_USER0) + offset;
2530			*map = SWAP_CONT_MAX | count;
2531			count = COUNT_CONTINUED;
2532			kunmap_atomic(map, KM_USER0);
2533			page = list_entry(page->lru.prev, struct page, lru);
2534		}
2535		return count == COUNT_CONTINUED;
2536	}
2537}
2538
2539/*
2540 * free_swap_count_continuations - swapoff free all the continuation pages
2541 * appended to the swap_map, after swap_map is quiesced, before vfree'ing it.
2542 */
2543static void free_swap_count_continuations(struct swap_info_struct *si)
2544{
2545	pgoff_t offset;
2546
2547	for (offset = 0; offset < si->max; offset += PAGE_SIZE) {
2548		struct page *head;
2549		head = vmalloc_to_page(si->swap_map + offset);
2550		if (page_private(head)) {
2551			struct list_head *this, *next;
2552			list_for_each_safe(this, next, &head->lru) {
2553				struct page *page;
2554				page = list_entry(this, struct page, lru);
2555				list_del(this);
2556				__free_page(page);
2557			}
2558		}
2559	}
2560}
2561