swapfile.c revision 52c50567d8ab0a0a87f12cceaa4194967854f0bd
1/*
2 *  linux/mm/swapfile.c
3 *
4 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
5 *  Swap reorganised 29.12.95, Stephen Tweedie
6 */
7
8#include <linux/mm.h>
9#include <linux/hugetlb.h>
10#include <linux/mman.h>
11#include <linux/slab.h>
12#include <linux/kernel_stat.h>
13#include <linux/swap.h>
14#include <linux/vmalloc.h>
15#include <linux/pagemap.h>
16#include <linux/namei.h>
17#include <linux/shm.h>
18#include <linux/blkdev.h>
19#include <linux/random.h>
20#include <linux/writeback.h>
21#include <linux/proc_fs.h>
22#include <linux/seq_file.h>
23#include <linux/init.h>
24#include <linux/module.h>
25#include <linux/ksm.h>
26#include <linux/rmap.h>
27#include <linux/security.h>
28#include <linux/backing-dev.h>
29#include <linux/mutex.h>
30#include <linux/capability.h>
31#include <linux/syscalls.h>
32#include <linux/memcontrol.h>
33#include <linux/poll.h>
34
35#include <asm/pgtable.h>
36#include <asm/tlbflush.h>
37#include <linux/swapops.h>
38#include <linux/page_cgroup.h>
39
40static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
41				 unsigned char);
42static void free_swap_count_continuations(struct swap_info_struct *);
43static sector_t map_swap_entry(swp_entry_t, struct block_device**);
44
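/*
 * swap_lock protects swap_list, the swap_info[] entries and the
 * nr_swap_pages / total_swap_pages counters below.
 */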
45static DEFINE_SPINLOCK(swap_lock);
46static unsigned int nr_swapfiles;
47long nr_swap_pages;
48long total_swap_pages;
49static int least_priority;
50
51static const char Bad_file[] = "Bad swap file entry ";
52static const char Unused_file[] = "Unused swap file entry ";
53static const char Bad_offset[] = "Bad swap offset entry ";
54static const char Unused_offset[] = "Unused swap offset entry ";
55
56static struct swap_list_t swap_list = {-1, -1};
57
58static struct swap_info_struct *swap_info[MAX_SWAPFILES];
59
60static DEFINE_MUTEX(swapon_mutex);
61
62static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait);
63/* Activity counter to indicate that a swapon or swapoff has occurred */
64static atomic_t proc_poll_event = ATOMIC_INIT(0);
65
66static inline unsigned char swap_count(unsigned char ent)
67{
68	return ent & ~SWAP_HAS_CACHE;	/* may include SWAP_HAS_CONT flag */
69}
70
71/* returns 1 if swap entry is freed */
72static int
73__try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
74{
75	swp_entry_t entry = swp_entry(si->type, offset);
76	struct page *page;
77	int ret = 0;
78
79	page = find_get_page(&swapper_space, entry.val);
80	if (!page)
81		return 0;
82	/*
83	 * This function is called from scan_swap_map(), which is reached from
84	 * vmscan.c while it is reclaiming pages, so a page lock may already be
85	 * held here.  We have to use trylock to avoid deadlock.  This is a
86	 * special case: in usual operations, use try_to_free_swap() with an
87	 * explicit lock_page().
88	 */
89	if (trylock_page(page)) {
90		ret = try_to_free_swap(page);
91		unlock_page(page);
92	}
93	page_cache_release(page);
94	return ret;
95}
96
97/*
98 * We need this because the bdev->unplug_fn can sleep and we cannot
99 * hold swap_lock while calling the unplug_fn. And swap_lock
100 * cannot be turned into a mutex.
101 */
102static DECLARE_RWSEM(swap_unplug_sem);
103
104void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page)
105{
106	swp_entry_t entry;
107
108	down_read(&swap_unplug_sem);
109	entry.val = page_private(page);
110	if (PageSwapCache(page)) {
111		struct block_device *bdev = swap_info[swp_type(entry)]->bdev;
112		struct backing_dev_info *bdi;
113
114		/*
115		 * If the page is removed from swapcache from under us (with a
116		 * racy try_to_unuse/swapoff) we need an additional reference
117		 * count to avoid reading garbage from page_private(page) above.
118		 * If the WARN_ON triggers during a swapoff it may be the race
119		 * condition and it's harmless. However if it triggers without
120		 * swapoff it signals a problem.
121		 */
122		WARN_ON(page_count(page) <= 1);
123
124		bdi = bdev->bd_inode->i_mapping->backing_dev_info;
125		blk_run_backing_dev(bdi, page);
126	}
127	up_read(&swap_unplug_sem);
128}
129
130/*
131 * swapon tells the device that all the old swap contents can be discarded,
132 * to allow the swap device to optimize its wear-levelling.
133 */
134static int discard_swap(struct swap_info_struct *si)
135{
136	struct swap_extent *se;
137	sector_t start_block;
138	sector_t nr_blocks;
139	int err = 0;
140
141	/* Do not discard the swap header page! */
142	se = &si->first_swap_extent;
143	start_block = (se->start_block + 1) << (PAGE_SHIFT - 9);
144	nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
145	if (nr_blocks) {
146		err = blkdev_issue_discard(si->bdev, start_block,
147				nr_blocks, GFP_KERNEL, 0);
148		if (err)
149			return err;
150		cond_resched();
151	}
152
153	list_for_each_entry(se, &si->first_swap_extent.list, list) {
154		start_block = se->start_block << (PAGE_SHIFT - 9);
155		nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
156
157		err = blkdev_issue_discard(si->bdev, start_block,
158				nr_blocks, GFP_KERNEL, 0);
159		if (err)
160			break;
161
162		cond_resched();
163	}
164	return err;		/* That will often be -EOPNOTSUPP */
165}
166
167/*
168 * swap allocation tells the device that a cluster of swap can now be discarded,
169 * to allow the swap device to optimize its wear-levelling.
170 */
171static void discard_swap_cluster(struct swap_info_struct *si,
172				 pgoff_t start_page, pgoff_t nr_pages)
173{
174	struct swap_extent *se = si->curr_swap_extent;
175	int found_extent = 0;
176
177	while (nr_pages) {
178		struct list_head *lh;
179
180		if (se->start_page <= start_page &&
181		    start_page < se->start_page + se->nr_pages) {
182			pgoff_t offset = start_page - se->start_page;
183			sector_t start_block = se->start_block + offset;
184			sector_t nr_blocks = se->nr_pages - offset;
185
186			if (nr_blocks > nr_pages)
187				nr_blocks = nr_pages;
188			start_page += nr_blocks;
189			nr_pages -= nr_blocks;
190
191			if (!found_extent++)
192				si->curr_swap_extent = se;
193
194			start_block <<= PAGE_SHIFT - 9;
195			nr_blocks <<= PAGE_SHIFT - 9;
196			if (blkdev_issue_discard(si->bdev, start_block,
197				    nr_blocks, GFP_NOIO, 0))
198				break;
199		}
200
201		lh = se->list.next;
202		se = list_entry(lh, struct swap_extent, list);
203	}
204}
205
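/*
 * Bit-wait callback: just reschedule while another task finishes
 * issuing the discard for a cluster (SWP_DISCARDING is set).
 */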
206static int wait_for_discard(void *word)
207{
208	schedule();
209	return 0;
210}
211
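/*
 * Allocation tries to lay out swap pages in runs of SWAPFILE_CLUSTER
 * slots; LATENCY_LIMIT bounds how many slots scan_swap_map() examines
 * before offering cond_resched().
 */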
212#define SWAPFILE_CLUSTER	256
213#define LATENCY_LIMIT		256
214
215static inline unsigned long scan_swap_map(struct swap_info_struct *si,
216					  unsigned char usage)
217{
218	unsigned long offset;
219	unsigned long scan_base;
220	unsigned long last_in_cluster = 0;
221	int latency_ration = LATENCY_LIMIT;
222	int found_free_cluster = 0;
223
224	/*
225	 * We try to cluster swap pages by allocating them sequentially
226	 * in swap.  Once we've allocated SWAPFILE_CLUSTER pages this
227	 * way, however, we resort to first-free allocation, starting
228	 * a new cluster.  This prevents us from scattering swap pages
229	 * all over the entire swap partition, so that we reduce
230	 * overall disk seek times between swap pages.  -- sct
231	 * But we do now try to find an empty cluster.  -Andrea
232	 * And we let swap pages go all over an SSD partition.  Hugh
233	 */
234
235	si->flags += SWP_SCANNING;
236	scan_base = offset = si->cluster_next;
237
238	if (unlikely(!si->cluster_nr--)) {
239		if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
240			si->cluster_nr = SWAPFILE_CLUSTER - 1;
241			goto checks;
242		}
243		if (si->flags & SWP_DISCARDABLE) {
244			/*
245			 * Start range check on racing allocations, in case
246			 * they overlap the cluster we eventually decide on
247			 * (we scan without swap_lock to allow preemption).
248			 * It's hardly conceivable that cluster_nr could be
249			 * wrapped during our scan, but don't depend on it.
250			 */
251			if (si->lowest_alloc)
252				goto checks;
253			si->lowest_alloc = si->max;
254			si->highest_alloc = 0;
255		}
256		spin_unlock(&swap_lock);
257
258		/*
259		 * If seek is expensive, start searching for new cluster from
260		 * start of partition, to minimize the span of allocated swap.
261		 * But if seek is cheap, search from our current position, so
262		 * that swap is allocated from all over the partition: if the
263		 * Flash Translation Layer only remaps within limited zones,
264		 * we don't want to wear out the first zone too quickly.
265		 */
266		if (!(si->flags & SWP_SOLIDSTATE))
267			scan_base = offset = si->lowest_bit;
268		last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
269
270		/* Locate the first empty (unaligned) cluster */
271		for (; last_in_cluster <= si->highest_bit; offset++) {
272			if (si->swap_map[offset])
273				last_in_cluster = offset + SWAPFILE_CLUSTER;
274			else if (offset == last_in_cluster) {
275				spin_lock(&swap_lock);
276				offset -= SWAPFILE_CLUSTER - 1;
277				si->cluster_next = offset;
278				si->cluster_nr = SWAPFILE_CLUSTER - 1;
279				found_free_cluster = 1;
280				goto checks;
281			}
282			if (unlikely(--latency_ration < 0)) {
283				cond_resched();
284				latency_ration = LATENCY_LIMIT;
285			}
286		}
287
288		offset = si->lowest_bit;
289		last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
290
291		/* Locate the first empty (unaligned) cluster */
292		for (; last_in_cluster < scan_base; offset++) {
293			if (si->swap_map[offset])
294				last_in_cluster = offset + SWAPFILE_CLUSTER;
295			else if (offset == last_in_cluster) {
296				spin_lock(&swap_lock);
297				offset -= SWAPFILE_CLUSTER - 1;
298				si->cluster_next = offset;
299				si->cluster_nr = SWAPFILE_CLUSTER - 1;
300				found_free_cluster = 1;
301				goto checks;
302			}
303			if (unlikely(--latency_ration < 0)) {
304				cond_resched();
305				latency_ration = LATENCY_LIMIT;
306			}
307		}
308
309		offset = scan_base;
310		spin_lock(&swap_lock);
311		si->cluster_nr = SWAPFILE_CLUSTER - 1;
312		si->lowest_alloc = 0;
313	}
314
315checks:
316	if (!(si->flags & SWP_WRITEOK))
317		goto no_page;
318	if (!si->highest_bit)
319		goto no_page;
320	if (offset > si->highest_bit)
321		scan_base = offset = si->lowest_bit;
322
323	/* reuse swap entry of cache-only swap if not busy. */
324	if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
325		int swap_was_freed;
326		spin_unlock(&swap_lock);
327		swap_was_freed = __try_to_reclaim_swap(si, offset);
328		spin_lock(&swap_lock);
329		/* entry was freed successfully, try to use this again */
330		if (swap_was_freed)
331			goto checks;
332		goto scan; /* check next one */
333	}
334
335	if (si->swap_map[offset])
336		goto scan;
337
338	if (offset == si->lowest_bit)
339		si->lowest_bit++;
340	if (offset == si->highest_bit)
341		si->highest_bit--;
342	si->inuse_pages++;
343	if (si->inuse_pages == si->pages) {
344		si->lowest_bit = si->max;
345		si->highest_bit = 0;
346	}
347	si->swap_map[offset] = usage;
348	si->cluster_next = offset + 1;
349	si->flags -= SWP_SCANNING;
350
351	if (si->lowest_alloc) {
352		/*
353		 * Only set when SWP_DISCARDABLE, and there's a scan
354		 * for a free cluster in progress or just completed.
355		 */
356		if (found_free_cluster) {
357			/*
358			 * To optimize wear-levelling, discard the
359			 * old data of the cluster, taking care not to
360			 * discard any of its pages that have already
361			 * been allocated by racing tasks (offset has
362			 * already stepped over any at the beginning).
363			 */
364			if (offset < si->highest_alloc &&
365			    si->lowest_alloc <= last_in_cluster)
366				last_in_cluster = si->lowest_alloc - 1;
367			si->flags |= SWP_DISCARDING;
368			spin_unlock(&swap_lock);
369
370			if (offset < last_in_cluster)
371				discard_swap_cluster(si, offset,
372					last_in_cluster - offset + 1);
373
374			spin_lock(&swap_lock);
375			si->lowest_alloc = 0;
376			si->flags &= ~SWP_DISCARDING;
377
378			smp_mb();	/* wake_up_bit advises this */
379			wake_up_bit(&si->flags, ilog2(SWP_DISCARDING));
380
381		} else if (si->flags & SWP_DISCARDING) {
382			/*
383			 * Delay using pages allocated by racing tasks
384			 * until the whole discard has been issued. We
385			 * could defer that delay until swap_writepage,
386			 * but it's easier to keep this self-contained.
387			 */
388			spin_unlock(&swap_lock);
389			wait_on_bit(&si->flags, ilog2(SWP_DISCARDING),
390				wait_for_discard, TASK_UNINTERRUPTIBLE);
391			spin_lock(&swap_lock);
392		} else {
393			/*
394			 * Note pages allocated by racing tasks while the
395			 * scan for a free cluster is in progress, so
396			 * that its final discard can exclude them.
397			 */
398			if (offset < si->lowest_alloc)
399				si->lowest_alloc = offset;
400			if (offset > si->highest_alloc)
401				si->highest_alloc = offset;
402		}
403	}
404	return offset;
405
406scan:
407	spin_unlock(&swap_lock);
408	while (++offset <= si->highest_bit) {
409		if (!si->swap_map[offset]) {
410			spin_lock(&swap_lock);
411			goto checks;
412		}
413		if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
414			spin_lock(&swap_lock);
415			goto checks;
416		}
417		if (unlikely(--latency_ration < 0)) {
418			cond_resched();
419			latency_ration = LATENCY_LIMIT;
420		}
421	}
422	offset = si->lowest_bit;
423	while (++offset < scan_base) {
424		if (!si->swap_map[offset]) {
425			spin_lock(&swap_lock);
426			goto checks;
427		}
428		if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
429			spin_lock(&swap_lock);
430			goto checks;
431		}
432		if (unlikely(--latency_ration < 0)) {
433			cond_resched();
434			latency_ration = LATENCY_LIMIT;
435		}
436	}
437	spin_lock(&swap_lock);
438
439no_page:
440	si->flags -= SWP_SCANNING;
441	return 0;
442}
443
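/*
 * Allocate one swap entry for the swap cache, cycling through the
 * active swap areas in priority order.  Returns a zero swp_entry_t
 * when no swap space is available.
 */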
444swp_entry_t get_swap_page(void)
445{
446	struct swap_info_struct *si;
447	pgoff_t offset;
448	int type, next;
449	int wrapped = 0;
450
451	spin_lock(&swap_lock);
452	if (nr_swap_pages <= 0)
453		goto noswap;
454	nr_swap_pages--;
455
456	for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) {
457		si = swap_info[type];
458		next = si->next;
459		if (next < 0 ||
460		    (!wrapped && si->prio != swap_info[next]->prio)) {
461			next = swap_list.head;
462			wrapped++;
463		}
464
465		if (!si->highest_bit)
466			continue;
467		if (!(si->flags & SWP_WRITEOK))
468			continue;
469
470		swap_list.next = next;
471		/* This is called for allocating swap entry for cache */
472		offset = scan_swap_map(si, SWAP_HAS_CACHE);
473		if (offset) {
474			spin_unlock(&swap_lock);
475			return swp_entry(type, offset);
476		}
477		next = swap_list.next;
478	}
479
480	nr_swap_pages++;
481noswap:
482	spin_unlock(&swap_lock);
483	return (swp_entry_t) {0};
484}
485
486/* The only caller of this function is now the suspend routine */
487swp_entry_t get_swap_page_of_type(int type)
488{
489	struct swap_info_struct *si;
490	pgoff_t offset;
491
492	spin_lock(&swap_lock);
493	si = swap_info[type];
494	if (si && (si->flags & SWP_WRITEOK)) {
495		nr_swap_pages--;
496		/* This is called for allocating swap entry, not cache */
497		offset = scan_swap_map(si, 1);
498		if (offset) {
499			spin_unlock(&swap_lock);
500			return swp_entry(type, offset);
501		}
502		nr_swap_pages++;
503	}
504	spin_unlock(&swap_lock);
505	return (swp_entry_t) {0};
506}
507
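/*
 * Validate a swap entry and return its swap_info_struct with swap_lock
 * held; returns NULL (printing a diagnostic for bad entries) if the
 * entry cannot be used.
 */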
508static struct swap_info_struct *swap_info_get(swp_entry_t entry)
509{
510	struct swap_info_struct *p;
511	unsigned long offset, type;
512
513	if (!entry.val)
514		goto out;
515	type = swp_type(entry);
516	if (type >= nr_swapfiles)
517		goto bad_nofile;
518	p = swap_info[type];
519	if (!(p->flags & SWP_USED))
520		goto bad_device;
521	offset = swp_offset(entry);
522	if (offset >= p->max)
523		goto bad_offset;
524	if (!p->swap_map[offset])
525		goto bad_free;
526	spin_lock(&swap_lock);
527	return p;
528
529bad_free:
530	printk(KERN_ERR "swap_free: %s%08lx\n", Unused_offset, entry.val);
531	goto out;
532bad_offset:
533	printk(KERN_ERR "swap_free: %s%08lx\n", Bad_offset, entry.val);
534	goto out;
535bad_device:
536	printk(KERN_ERR "swap_free: %s%08lx\n", Unused_file, entry.val);
537	goto out;
538bad_nofile:
539	printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val);
540out:
541	return NULL;
542}
543
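/*
 * Drop one reference (or the SWAP_HAS_CACHE flag, when usage says so)
 * from a swap_map slot, with swap_lock held.  Returns the remaining
 * usage; a slot which reaches zero becomes available for reallocation.
 */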
544static unsigned char swap_entry_free(struct swap_info_struct *p,
545				     swp_entry_t entry, unsigned char usage)
546{
547	unsigned long offset = swp_offset(entry);
548	unsigned char count;
549	unsigned char has_cache;
550
551	count = p->swap_map[offset];
552	has_cache = count & SWAP_HAS_CACHE;
553	count &= ~SWAP_HAS_CACHE;
554
555	if (usage == SWAP_HAS_CACHE) {
556		VM_BUG_ON(!has_cache);
557		has_cache = 0;
558	} else if (count == SWAP_MAP_SHMEM) {
559		/*
560		 * Or we could insist on shmem.c using a special
561		 * swap_shmem_free() and free_shmem_swap_and_cache()...
562		 */
563		count = 0;
564	} else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
565		if (count == COUNT_CONTINUED) {
566			if (swap_count_continued(p, offset, count))
567				count = SWAP_MAP_MAX | COUNT_CONTINUED;
568			else
569				count = SWAP_MAP_MAX;
570		} else
571			count--;
572	}
573
574	if (!count)
575		mem_cgroup_uncharge_swap(entry);
576
577	usage = count | has_cache;
578	p->swap_map[offset] = usage;
579
580	/* free if no reference */
581	if (!usage) {
582		struct gendisk *disk = p->bdev->bd_disk;
583		if (offset < p->lowest_bit)
584			p->lowest_bit = offset;
585		if (offset > p->highest_bit)
586			p->highest_bit = offset;
587		if (swap_list.next >= 0 &&
588		    p->prio > swap_info[swap_list.next]->prio)
589			swap_list.next = p->type;
590		nr_swap_pages++;
591		p->inuse_pages--;
592		if ((p->flags & SWP_BLKDEV) &&
593				disk->fops->swap_slot_free_notify)
594			disk->fops->swap_slot_free_notify(p->bdev, offset);
595	}
596
597	return usage;
598}
599
600/*
601 * Caller has made sure that the swapdevice corresponding to entry
602 * is still around or has not been recycled.
603 */
604void swap_free(swp_entry_t entry)
605{
606	struct swap_info_struct *p;
607
608	p = swap_info_get(entry);
609	if (p) {
610		swap_entry_free(p, entry, 1);
611		spin_unlock(&swap_lock);
612	}
613}
614
615/*
616 * Called after dropping swapcache to decrease the refcount of swap entries.
617 */
618void swapcache_free(swp_entry_t entry, struct page *page)
619{
620	struct swap_info_struct *p;
621	unsigned char count;
622
623	p = swap_info_get(entry);
624	if (p) {
625		count = swap_entry_free(p, entry, SWAP_HAS_CACHE);
626		if (page)
627			mem_cgroup_uncharge_swapcache(page, entry, count != 0);
628		spin_unlock(&swap_lock);
629	}
630}
631
632/*
633 * How many references to page are currently swapped out?
634 * This does not give an exact answer when swap count is continued,
635 * but does include the high COUNT_CONTINUED flag to allow for that.
636 */
637static inline int page_swapcount(struct page *page)
638{
639	int count = 0;
640	struct swap_info_struct *p;
641	swp_entry_t entry;
642
643	entry.val = page_private(page);
644	p = swap_info_get(entry);
645	if (p) {
646		count = swap_count(p->swap_map[swp_offset(entry)]);
647		spin_unlock(&swap_lock);
648	}
649	return count;
650}
651
652/*
653 * We can write to an anon page without COW if there are no other references
654 * to it.  And as a side-effect, free up its swap: because the old content
655 * on disk will never be read, and seeking back there to write new content
656 * later would only waste time away from clustering.
657 */
658int reuse_swap_page(struct page *page)
659{
660	int count;
661
662	VM_BUG_ON(!PageLocked(page));
663	if (unlikely(PageKsm(page)))
664		return 0;
665	count = page_mapcount(page);
666	if (count <= 1 && PageSwapCache(page)) {
667		count += page_swapcount(page);
668		if (count == 1 && !PageWriteback(page)) {
669			delete_from_swap_cache(page);
670			SetPageDirty(page);
671		}
672	}
673	return count <= 1;
674}
675
676/*
677 * If swap is getting full, or if there are no more mappings of this page,
678 * then try_to_free_swap is called to free its swap space.
679 */
680int try_to_free_swap(struct page *page)
681{
682	VM_BUG_ON(!PageLocked(page));
683
684	if (!PageSwapCache(page))
685		return 0;
686	if (PageWriteback(page))
687		return 0;
688	if (page_swapcount(page))
689		return 0;
690
691	/*
692	 * Once hibernation has begun to create its image of memory,
693	 * there's a danger that one of the calls to try_to_free_swap()
694	 * - most probably a call from __try_to_reclaim_swap() while
695	 * hibernation is allocating its own swap pages for the image,
696	 * but conceivably even a call from memory reclaim - will free
697	 * the swap from a page which has already been recorded in the
698	 * image as a clean swapcache page, and then reuse its swap for
699	 * another page of the image.  On waking from hibernation, the
700	 * original page might be freed under memory pressure, then
701	 * later read back in from swap, now with the wrong data.
702	 *
703	 * Hibernation clears bits from gfp_allowed_mask to prevent
704	 * memory reclaim from writing to disk, so check that here.
705	 */
706	if (!(gfp_allowed_mask & __GFP_IO))
707		return 0;
708
709	delete_from_swap_cache(page);
710	SetPageDirty(page);
711	return 1;
712}
713
714/*
715 * Free the swap entry like above, but also try to
716 * free the page cache entry if it is the last user.
717 */
718int free_swap_and_cache(swp_entry_t entry)
719{
720	struct swap_info_struct *p;
721	struct page *page = NULL;
722
723	if (non_swap_entry(entry))
724		return 1;
725
726	p = swap_info_get(entry);
727	if (p) {
728		if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) {
729			page = find_get_page(&swapper_space, entry.val);
730			if (page && !trylock_page(page)) {
731				page_cache_release(page);
732				page = NULL;
733			}
734		}
735		spin_unlock(&swap_lock);
736	}
737	if (page) {
738		/*
739		 * Not mapped elsewhere, or swap space full? Free it!
740		 * Also recheck PageSwapCache now page is locked (above).
741		 */
742		if (PageSwapCache(page) && !PageWriteback(page) &&
743				(!page_mapped(page) || vm_swap_full())) {
744			delete_from_swap_cache(page);
745			SetPageDirty(page);
746		}
747		unlock_page(page);
748		page_cache_release(page);
749	}
750	return p != NULL;
751}
752
753#ifdef CONFIG_CGROUP_MEM_RES_CTLR
754/**
755 * mem_cgroup_count_swap_user - count the users of a swap entry
756 * @ent: the swap entry to be checked
757 * @pagep: pointer through which the entry's swap cache page, if any, is returned
758 *
759 * Returns the number of users of the swap entry. The number is valid only
760 * for swap entries of anonymous pages.
761 * If the entry is found in the swap cache, the page is stored to @pagep
762 * with its refcount incremented.
763 */
764int mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep)
765{
766	struct page *page;
767	struct swap_info_struct *p;
768	int count = 0;
769
770	page = find_get_page(&swapper_space, ent.val);
771	if (page)
772		count += page_mapcount(page);
773	p = swap_info_get(ent);
774	if (p) {
775		count += swap_count(p->swap_map[swp_offset(ent)]);
776		spin_unlock(&swap_lock);
777	}
778
779	*pagep = page;
780	return count;
781}
782#endif
783
784#ifdef CONFIG_HIBERNATION
785/*
786 * Find the swap type that corresponds to the given device (if any).
787 *
788 * @offset - number of the PAGE_SIZE-sized block of the device, starting
789 * from 0, in which the swap header is expected to be located.
790 *
791 * This is needed for the suspend to disk (aka swsusp).
792 */
793int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
794{
795	struct block_device *bdev = NULL;
796	int type;
797
798	if (device)
799		bdev = bdget(device);
800
801	spin_lock(&swap_lock);
802	for (type = 0; type < nr_swapfiles; type++) {
803		struct swap_info_struct *sis = swap_info[type];
804
805		if (!(sis->flags & SWP_WRITEOK))
806			continue;
807
808		if (!bdev) {
809			if (bdev_p)
810				*bdev_p = bdgrab(sis->bdev);
811
812			spin_unlock(&swap_lock);
813			return type;
814		}
815		if (bdev == sis->bdev) {
816			struct swap_extent *se = &sis->first_swap_extent;
817
818			if (se->start_block == offset) {
819				if (bdev_p)
820					*bdev_p = bdgrab(sis->bdev);
821
822				spin_unlock(&swap_lock);
823				bdput(bdev);
824				return type;
825			}
826		}
827	}
828	spin_unlock(&swap_lock);
829	if (bdev)
830		bdput(bdev);
831
832	return -ENODEV;
833}
834
835/*
836 * Get the (PAGE_SIZE) block corresponding to the given offset on the swapdev
837 * corresponding to the given index in swap_info (swap type).
838 */
839sector_t swapdev_block(int type, pgoff_t offset)
840{
841	struct block_device *bdev;
842
843	if ((unsigned int)type >= nr_swapfiles)
844		return 0;
845	if (!(swap_info[type]->flags & SWP_WRITEOK))
846		return 0;
847	return map_swap_entry(swp_entry(type, offset), &bdev);
848}
849
850/*
851 * Return either the total number of swap pages of the given type, or the
852 * number of free pages of that type (depending on @free)
853 *
854 * This is needed for software suspend
855 */
856unsigned int count_swap_pages(int type, int free)
857{
858	unsigned int n = 0;
859
860	spin_lock(&swap_lock);
861	if ((unsigned int)type < nr_swapfiles) {
862		struct swap_info_struct *sis = swap_info[type];
863
864		if (sis->flags & SWP_WRITEOK) {
865			n = sis->pages;
866			if (free)
867				n -= sis->inuse_pages;
868		}
869	}
870	spin_unlock(&swap_lock);
871	return n;
872}
873#endif /* CONFIG_HIBERNATION */
874
875/*
876 * No need to decide whether this PTE shares the swap entry with others,
877 * just let do_wp_page work it out if a write is requested later - to
878 * force COW, vm_page_prot omits write permission from any private vma.
879 */
880static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
881		unsigned long addr, swp_entry_t entry, struct page *page)
882{
883	struct mem_cgroup *ptr = NULL;
884	spinlock_t *ptl;
885	pte_t *pte;
886	int ret = 1;
887
888	if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, GFP_KERNEL, &ptr)) {
889		ret = -ENOMEM;
890		goto out_nolock;
891	}
892
893	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
894	if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) {
895		if (ret > 0)
896			mem_cgroup_cancel_charge_swapin(ptr);
897		ret = 0;
898		goto out;
899	}
900
901	dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
902	inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
903	get_page(page);
904	set_pte_at(vma->vm_mm, addr, pte,
905		   pte_mkold(mk_pte(page, vma->vm_page_prot)));
906	page_add_anon_rmap(page, vma, addr);
907	mem_cgroup_commit_charge_swapin(page, ptr);
908	swap_free(entry);
909	/*
910	 * Move the page to the active list so it is not
911	 * immediately swapped out again after swapon.
912	 */
913	activate_page(page);
914out:
915	pte_unmap_unlock(pte, ptl);
916out_nolock:
917	return ret;
918}
919
920static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
921				unsigned long addr, unsigned long end,
922				swp_entry_t entry, struct page *page)
923{
924	pte_t swp_pte = swp_entry_to_pte(entry);
925	pte_t *pte;
926	int ret = 0;
927
928	/*
929	 * We don't actually need pte lock while scanning for swp_pte: since
930	 * we hold page lock and mmap_sem, swp_pte cannot be inserted into the
931	 * page table while we're scanning; though it could get zapped, and on
932	 * some architectures (e.g. x86_32 with PAE) we might catch a glimpse
933	 * of unmatched parts which look like swp_pte, so unuse_pte must
934	 * recheck under pte lock.  Scanning without pte lock lets it be
935	 * preemptible whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE.
936	 */
937	pte = pte_offset_map(pmd, addr);
938	do {
939		/*
940		 * swapoff spends a _lot_ of time in this loop!
941		 * Test inline before going to call unuse_pte.
942		 */
943		if (unlikely(pte_same(*pte, swp_pte))) {
944			pte_unmap(pte);
945			ret = unuse_pte(vma, pmd, addr, entry, page);
946			if (ret)
947				goto out;
948			pte = pte_offset_map(pmd, addr);
949		}
950	} while (pte++, addr += PAGE_SIZE, addr != end);
951	pte_unmap(pte - 1);
952out:
953	return ret;
954}
955
956static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
957				unsigned long addr, unsigned long end,
958				swp_entry_t entry, struct page *page)
959{
960	pmd_t *pmd;
961	unsigned long next;
962	int ret;
963
964	pmd = pmd_offset(pud, addr);
965	do {
966		next = pmd_addr_end(addr, end);
967		if (unlikely(pmd_trans_huge(*pmd)))
968			continue;
969		if (pmd_none_or_clear_bad(pmd))
970			continue;
971		ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
972		if (ret)
973			return ret;
974	} while (pmd++, addr = next, addr != end);
975	return 0;
976}
977
978static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
979				unsigned long addr, unsigned long end,
980				swp_entry_t entry, struct page *page)
981{
982	pud_t *pud;
983	unsigned long next;
984	int ret;
985
986	pud = pud_offset(pgd, addr);
987	do {
988		next = pud_addr_end(addr, end);
989		if (pud_none_or_clear_bad(pud))
990			continue;
991		ret = unuse_pmd_range(vma, pud, addr, next, entry, page);
992		if (ret)
993			return ret;
994	} while (pud++, addr = next, addr != end);
995	return 0;
996}
997
998static int unuse_vma(struct vm_area_struct *vma,
999				swp_entry_t entry, struct page *page)
1000{
1001	pgd_t *pgd;
1002	unsigned long addr, end, next;
1003	int ret;
1004
1005	if (page_anon_vma(page)) {
1006		addr = page_address_in_vma(page, vma);
1007		if (addr == -EFAULT)
1008			return 0;
1009		else
1010			end = addr + PAGE_SIZE;
1011	} else {
1012		addr = vma->vm_start;
1013		end = vma->vm_end;
1014	}
1015
1016	pgd = pgd_offset(vma->vm_mm, addr);
1017	do {
1018		next = pgd_addr_end(addr, end);
1019		if (pgd_none_or_clear_bad(pgd))
1020			continue;
1021		ret = unuse_pud_range(vma, pgd, addr, next, entry, page);
1022		if (ret)
1023			return ret;
1024	} while (pgd++, addr = next, addr != end);
1025	return 0;
1026}
1027
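/*
 * Search one mm for ptes referencing this swap entry and replace them
 * with the given (locked) page.  May temporarily drop the page lock if
 * mmap_sem cannot be taken without sleeping.
 */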
1028static int unuse_mm(struct mm_struct *mm,
1029				swp_entry_t entry, struct page *page)
1030{
1031	struct vm_area_struct *vma;
1032	int ret = 0;
1033
1034	if (!down_read_trylock(&mm->mmap_sem)) {
1035		/*
1036		 * Activate page so shrink_inactive_list is unlikely to unmap
1037		 * its ptes while lock is dropped, so swapoff can make progress.
1038		 */
1039		activate_page(page);
1040		unlock_page(page);
1041		down_read(&mm->mmap_sem);
1042		lock_page(page);
1043	}
1044	for (vma = mm->mmap; vma; vma = vma->vm_next) {
1045		if (vma->anon_vma && (ret = unuse_vma(vma, entry, page)))
1046			break;
1047	}
1048	up_read(&mm->mmap_sem);
1049	return (ret < 0)? ret: 0;
1050}
1051
1052/*
1053 * Scan swap_map from current position to next entry still in use.
1054 * Recycle to start on reaching the end, returning 0 when empty.
1055 */
1056static unsigned int find_next_to_unuse(struct swap_info_struct *si,
1057					unsigned int prev)
1058{
1059	unsigned int max = si->max;
1060	unsigned int i = prev;
1061	unsigned char count;
1062
1063	/*
1064	 * No need for swap_lock here: we're just looking
1065	 * for whether an entry is in use, not modifying it; false
1066	 * hits are okay, and sys_swapoff() has already prevented new
1067	 * allocations from this area (while holding swap_lock).
1068	 */
1069	for (;;) {
1070		if (++i >= max) {
1071			if (!prev) {
1072				i = 0;
1073				break;
1074			}
1075			/*
1076			 * No entries in use at top of swap_map,
1077			 * loop back to start and recheck there.
1078			 */
1079			max = prev + 1;
1080			prev = 0;
1081			i = 1;
1082		}
1083		count = si->swap_map[i];
1084		if (count && swap_count(count) != SWAP_MAP_BAD)
1085			break;
1086	}
1087	return i;
1088}
1089
1090/*
1091 * We completely avoid races by reading each swap page in advance,
1092 * and then searching for the process using it.  All the necessary
1093 * page table adjustments can then be made atomically.
1094 */
1095static int try_to_unuse(unsigned int type)
1096{
1097	struct swap_info_struct *si = swap_info[type];
1098	struct mm_struct *start_mm;
1099	unsigned char *swap_map;
1100	unsigned char swcount;
1101	struct page *page;
1102	swp_entry_t entry;
1103	unsigned int i = 0;
1104	int retval = 0;
1105
1106	/*
1107	 * When searching mms for an entry, a good strategy is to
1108	 * start at the first mm we freed the previous entry from
1109	 * (though actually we don't notice whether we or coincidence
1110	 * freed the entry).  Initialize this start_mm with a hold.
1111	 *
1112	 * A simpler strategy would be to start at the last mm we
1113	 * freed the previous entry from; but that would take less
1114	 * advantage of mmlist ordering, which clusters forked mms
1115	 * together, child after parent.  If we race with dup_mmap(), we
1116	 * prefer to resolve parent before child, lest we miss entries
1117	 * duplicated after we scanned child: using last mm would invert
1118	 * that.
1119	 */
1120	start_mm = &init_mm;
1121	atomic_inc(&init_mm.mm_users);
1122
1123	/*
1124	 * Keep on scanning until all entries have gone.  Usually,
1125	 * one pass through swap_map is enough, but not necessarily:
1126	 * there are races when an instance of an entry might be missed.
1127	 */
1128	while ((i = find_next_to_unuse(si, i)) != 0) {
1129		if (signal_pending(current)) {
1130			retval = -EINTR;
1131			break;
1132		}
1133
1134		/*
1135		 * Get a page for the entry, using the existing swap
1136		 * cache page if there is one.  Otherwise, get a clean
1137		 * page and read the swap into it.
1138		 */
1139		swap_map = &si->swap_map[i];
1140		entry = swp_entry(type, i);
1141		page = read_swap_cache_async(entry,
1142					GFP_HIGHUSER_MOVABLE, NULL, 0);
1143		if (!page) {
1144			/*
1145			 * Either swap_duplicate() failed because entry
1146			 * has been freed independently, and will not be
1147			 * reused since sys_swapoff() already disabled
1148			 * allocation from here, or alloc_page() failed.
1149			 */
1150			if (!*swap_map)
1151				continue;
1152			retval = -ENOMEM;
1153			break;
1154		}
1155
1156		/*
1157		 * Don't hold on to start_mm if it looks like exiting.
1158		 */
1159		if (atomic_read(&start_mm->mm_users) == 1) {
1160			mmput(start_mm);
1161			start_mm = &init_mm;
1162			atomic_inc(&init_mm.mm_users);
1163		}
1164
1165		/*
1166		 * Wait for and lock page.  When do_swap_page races with
1167		 * try_to_unuse, do_swap_page can handle the fault much
1168		 * faster than try_to_unuse can locate the entry.  This
1169		 * apparently redundant "wait_on_page_locked" lets try_to_unuse
1170		 * defer to do_swap_page in such a case - in some tests,
1171		 * do_swap_page and try_to_unuse repeatedly compete.
1172		 */
1173		wait_on_page_locked(page);
1174		wait_on_page_writeback(page);
1175		lock_page(page);
1176		wait_on_page_writeback(page);
1177
1178		/*
1179		 * Remove all references to entry.
1180		 */
1181		swcount = *swap_map;
1182		if (swap_count(swcount) == SWAP_MAP_SHMEM) {
1183			retval = shmem_unuse(entry, page);
1184			/* page has already been unlocked and released */
1185			if (retval < 0)
1186				break;
1187			continue;
1188		}
1189		if (swap_count(swcount) && start_mm != &init_mm)
1190			retval = unuse_mm(start_mm, entry, page);
1191
1192		if (swap_count(*swap_map)) {
1193			int set_start_mm = (*swap_map >= swcount);
1194			struct list_head *p = &start_mm->mmlist;
1195			struct mm_struct *new_start_mm = start_mm;
1196			struct mm_struct *prev_mm = start_mm;
1197			struct mm_struct *mm;
1198
1199			atomic_inc(&new_start_mm->mm_users);
1200			atomic_inc(&prev_mm->mm_users);
1201			spin_lock(&mmlist_lock);
1202			while (swap_count(*swap_map) && !retval &&
1203					(p = p->next) != &start_mm->mmlist) {
1204				mm = list_entry(p, struct mm_struct, mmlist);
1205				if (!atomic_inc_not_zero(&mm->mm_users))
1206					continue;
1207				spin_unlock(&mmlist_lock);
1208				mmput(prev_mm);
1209				prev_mm = mm;
1210
1211				cond_resched();
1212
1213				swcount = *swap_map;
1214				if (!swap_count(swcount)) /* any usage ? */
1215					;
1216				else if (mm == &init_mm)
1217					set_start_mm = 1;
1218				else
1219					retval = unuse_mm(mm, entry, page);
1220
1221				if (set_start_mm && *swap_map < swcount) {
1222					mmput(new_start_mm);
1223					atomic_inc(&mm->mm_users);
1224					new_start_mm = mm;
1225					set_start_mm = 0;
1226				}
1227				spin_lock(&mmlist_lock);
1228			}
1229			spin_unlock(&mmlist_lock);
1230			mmput(prev_mm);
1231			mmput(start_mm);
1232			start_mm = new_start_mm;
1233		}
1234		if (retval) {
1235			unlock_page(page);
1236			page_cache_release(page);
1237			break;
1238		}
1239
1240		/*
1241		 * If a reference remains (rare), we would like to leave
1242		 * the page in the swap cache; but try_to_unmap could
1243		 * then re-duplicate the entry once we drop page lock,
1244		 * so we might loop indefinitely; also, that page could
1245		 * not be swapped out to other storage meanwhile.  So:
1246		 * delete from cache even if there's another reference,
1247		 * after ensuring that the data has been saved to disk -
1248		 * since if the reference remains (rarer), it will be
1249		 * read from disk into another page.  Splitting into two
1250		 * pages would be incorrect if swap supported "shared
1251		 * private" pages, but they are handled by tmpfs files.
1252		 *
1253		 * Given how unuse_vma() targets one particular offset
1254		 * in an anon_vma, once the anon_vma has been determined,
1255		 * this splitting happens to be just what is needed to
1256		 * handle where KSM pages have been swapped out: re-reading
1257		 * is unnecessarily slow, but we can fix that later on.
1258		 */
1259		if (swap_count(*swap_map) &&
1260		     PageDirty(page) && PageSwapCache(page)) {
1261			struct writeback_control wbc = {
1262				.sync_mode = WB_SYNC_NONE,
1263			};
1264
1265			swap_writepage(page, &wbc);
1266			lock_page(page);
1267			wait_on_page_writeback(page);
1268		}
1269
1270		/*
1271		 * It is conceivable that a racing task removed this page from
1272		 * swap cache just before we acquired the page lock at the top,
1273		 * or while we dropped it in unuse_mm().  The page might even
1274		 * be back in swap cache on another swap area: that we must not
1275		 * delete, since it may not have been written out to swap yet.
1276		 */
1277		if (PageSwapCache(page) &&
1278		    likely(page_private(page) == entry.val))
1279			delete_from_swap_cache(page);
1280
1281		/*
1282		 * So that we could skip searching mms once the swap count went
1283		 * to 1, we did not mark any present ptes as dirty: we must now
1284		 * mark the page dirty so shrink_page_list will preserve it.
1285		 */
1286		SetPageDirty(page);
1287		unlock_page(page);
1288		page_cache_release(page);
1289
1290		/*
1291		 * Make sure that we aren't completely killing
1292		 * interactive performance.
1293		 */
1294		cond_resched();
1295	}
1296
1297	mmput(start_mm);
1298	return retval;
1299}
1300
1301/*
1302 * After a successful try_to_unuse, if no swap is now in use, we know
1303 * we can empty the mmlist.  swap_lock must be held on entry and exit.
1304 * Note that mmlist_lock nests inside swap_lock, and an mm must be
1305 * added to the mmlist just after swap_duplicate - before would be racy.
1306 */
1307static void drain_mmlist(void)
1308{
1309	struct list_head *p, *next;
1310	unsigned int type;
1311
1312	for (type = 0; type < nr_swapfiles; type++)
1313		if (swap_info[type]->inuse_pages)
1314			return;
1315	spin_lock(&mmlist_lock);
1316	list_for_each_safe(p, next, &init_mm.mmlist)
1317		list_del_init(p);
1318	spin_unlock(&mmlist_lock);
1319}
1320
1321/*
1322 * Use this swapdev's extent info to locate the (PAGE_SIZE) block which
1323 * corresponds to page offset for the specified swap entry.
1324 * Note that the return type of this function is sector_t, but it returns a
1325 * page offset into the bdev, not a sector offset.
1326 */
1327static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
1328{
1329	struct swap_info_struct *sis;
1330	struct swap_extent *start_se;
1331	struct swap_extent *se;
1332	pgoff_t offset;
1333
1334	sis = swap_info[swp_type(entry)];
1335	*bdev = sis->bdev;
1336
1337	offset = swp_offset(entry);
1338	start_se = sis->curr_swap_extent;
1339	se = start_se;
1340
1341	for ( ; ; ) {
1342		struct list_head *lh;
1343
1344		if (se->start_page <= offset &&
1345				offset < (se->start_page + se->nr_pages)) {
1346			return se->start_block + (offset - se->start_page);
1347		}
1348		lh = se->list.next;
1349		se = list_entry(lh, struct swap_extent, list);
1350		sis->curr_swap_extent = se;
1351		BUG_ON(se == start_se);		/* It *must* be present */
1352	}
1353}
1354
1355/*
1356 * Returns the page offset into bdev for the specified page's swap entry.
1357 */
1358sector_t map_swap_page(struct page *page, struct block_device **bdev)
1359{
1360	swp_entry_t entry;
1361	entry.val = page_private(page);
1362	return map_swap_entry(entry, bdev);
1363}
1364
1365/*
1366 * Free all of a swapdev's extent information
1367 */
1368static void destroy_swap_extents(struct swap_info_struct *sis)
1369{
1370	while (!list_empty(&sis->first_swap_extent.list)) {
1371		struct swap_extent *se;
1372
1373		se = list_entry(sis->first_swap_extent.list.next,
1374				struct swap_extent, list);
1375		list_del(&se->list);
1376		kfree(se);
1377	}
1378}
1379
1380/*
1381 * Add a block range (and the corresponding page range) into this swapdev's
1382 * extent list.  The extent list is kept sorted in page order.
1383 *
1384 * This function rather assumes that it is called in ascending page order.
1385 */
1386static int
1387add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
1388		unsigned long nr_pages, sector_t start_block)
1389{
1390	struct swap_extent *se;
1391	struct swap_extent *new_se;
1392	struct list_head *lh;
1393
1394	if (start_page == 0) {
1395		se = &sis->first_swap_extent;
1396		sis->curr_swap_extent = se;
1397		se->start_page = 0;
1398		se->nr_pages = nr_pages;
1399		se->start_block = start_block;
1400		return 1;
1401	} else {
1402		lh = sis->first_swap_extent.list.prev;	/* Highest extent */
1403		se = list_entry(lh, struct swap_extent, list);
1404		BUG_ON(se->start_page + se->nr_pages != start_page);
1405		if (se->start_block + se->nr_pages == start_block) {
1406			/* Merge it */
1407			se->nr_pages += nr_pages;
1408			return 0;
1409		}
1410	}
1411
1412	/*
1413	 * No merge.  Insert a new extent, preserving ordering.
1414	 */
1415	new_se = kmalloc(sizeof(*se), GFP_KERNEL);
1416	if (new_se == NULL)
1417		return -ENOMEM;
1418	new_se->start_page = start_page;
1419	new_se->nr_pages = nr_pages;
1420	new_se->start_block = start_block;
1421
1422	list_add_tail(&new_se->list, &sis->first_swap_extent.list);
1423	return 1;
1424}
1425
1426/*
1427 * A `swap extent' is a simple thing which maps a contiguous range of pages
1428 * onto a contiguous range of disk blocks.  An ordered list of swap extents
1429 * is built at swapon time and is then used at swap_writepage/swap_readpage
1430 * time for locating where on disk a page belongs.
1431 *
1432 * If the swapfile is an S_ISBLK block device, a single extent is installed.
1433 * This is done so that the main operating code can treat S_ISBLK and S_ISREG
1434 * swap files identically.
1435 *
1436 * Whether the swapdev is an S_ISREG file or an S_ISBLK blockdev, the swap
1437 * extent list operates in PAGE_SIZE disk blocks.  Both S_ISREG and S_ISBLK
1438 * swapfiles are handled *identically* after swapon time.
1439 *
1440 * For S_ISREG swapfiles, setup_swap_extents() will walk all the file's blocks
1441 * and will parse them into an ordered extent list, in PAGE_SIZE chunks.  If
1442 * some stray blocks are found which do not fall within the PAGE_SIZE alignment
1443 * requirements, they are simply tossed out - we will never use those blocks
1444 * for swapping.
1445 *
1446 * For S_ISREG swapfiles we set S_SWAPFILE across the life of the swapon.  This
1447 * prevents root from shooting her foot off by ftruncating an in-use swapfile,
1448 * which will scribble on the fs.
1449 *
1450 * The amount of disk space which a single swap extent represents varies.
1451 * Typically it is in the 1-4 megabyte range.  So we can have hundreds of
1452 * extents in the list.  To avoid much list walking, we cache the previous
1453 * search location in `curr_swap_extent', and start new searches from there.
1454 * This is extremely effective.  The average number of iterations in
1455 * map_swap_page() has been measured at about 0.3 per page.  - akpm.
1456 */
1457static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
1458{
1459	struct inode *inode;
1460	unsigned blocks_per_page;
1461	unsigned long page_no;
1462	unsigned blkbits;
1463	sector_t probe_block;
1464	sector_t last_block;
1465	sector_t lowest_block = -1;
1466	sector_t highest_block = 0;
1467	int nr_extents = 0;
1468	int ret;
1469
1470	inode = sis->swap_file->f_mapping->host;
1471	if (S_ISBLK(inode->i_mode)) {
1472		ret = add_swap_extent(sis, 0, sis->max, 0);
1473		*span = sis->pages;
1474		goto out;
1475	}
1476
1477	blkbits = inode->i_blkbits;
1478	blocks_per_page = PAGE_SIZE >> blkbits;
1479
1480	/*
1481	 * Map all the blocks into the extent list.  This code doesn't try
1482	 * to be very smart.
1483	 */
1484	probe_block = 0;
1485	page_no = 0;
1486	last_block = i_size_read(inode) >> blkbits;
1487	while ((probe_block + blocks_per_page) <= last_block &&
1488			page_no < sis->max) {
1489		unsigned block_in_page;
1490		sector_t first_block;
1491
1492		first_block = bmap(inode, probe_block);
1493		if (first_block == 0)
1494			goto bad_bmap;
1495
1496		/*
1497		 * It must be PAGE_SIZE aligned on-disk
1498		 */
1499		if (first_block & (blocks_per_page - 1)) {
1500			probe_block++;
1501			goto reprobe;
1502		}
1503
1504		for (block_in_page = 1; block_in_page < blocks_per_page;
1505					block_in_page++) {
1506			sector_t block;
1507
1508			block = bmap(inode, probe_block + block_in_page);
1509			if (block == 0)
1510				goto bad_bmap;
1511			if (block != first_block + block_in_page) {
1512				/* Discontiguity */
1513				probe_block++;
1514				goto reprobe;
1515			}
1516		}
1517
1518		first_block >>= (PAGE_SHIFT - blkbits);
1519		if (page_no) {	/* exclude the header page */
1520			if (first_block < lowest_block)
1521				lowest_block = first_block;
1522			if (first_block > highest_block)
1523				highest_block = first_block;
1524		}
1525
1526		/*
1527		 * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks
1528		 */
1529		ret = add_swap_extent(sis, page_no, 1, first_block);
1530		if (ret < 0)
1531			goto out;
1532		nr_extents += ret;
1533		page_no++;
1534		probe_block += blocks_per_page;
1535reprobe:
1536		continue;
1537	}
1538	ret = nr_extents;
1539	*span = 1 + highest_block - lowest_block;
1540	if (page_no == 0)
1541		page_no = 1;	/* force Empty message */
1542	sis->max = page_no;
1543	sis->pages = page_no - 1;
1544	sis->highest_bit = page_no - 1;
1545out:
1546	return ret;
1547bad_bmap:
1548	printk(KERN_ERR "swapon: swapfile has holes\n");
1549	ret = -EINVAL;
1550	goto out;
1551}
1552
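/*
 * The swapoff system call: stop allocations from the named swap area,
 * pull every entry still in use back into memory via try_to_unuse(),
 * then tear down the area's extents and swap_map.
 */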
1553SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1554{
1555	struct swap_info_struct *p = NULL;
1556	unsigned char *swap_map;
1557	struct file *swap_file, *victim;
1558	struct address_space *mapping;
1559	struct inode *inode;
1560	char *pathname;
1561	int i, type, prev;
1562	int err;
1563
1564	if (!capable(CAP_SYS_ADMIN))
1565		return -EPERM;
1566
1567	pathname = getname(specialfile);
1568	err = PTR_ERR(pathname);
1569	if (IS_ERR(pathname))
1570		goto out;
1571
1572	victim = filp_open(pathname, O_RDWR|O_LARGEFILE, 0);
1573	putname(pathname);
1574	err = PTR_ERR(victim);
1575	if (IS_ERR(victim))
1576		goto out;
1577
1578	mapping = victim->f_mapping;
1579	prev = -1;
1580	spin_lock(&swap_lock);
1581	for (type = swap_list.head; type >= 0; type = swap_info[type]->next) {
1582		p = swap_info[type];
1583		if (p->flags & SWP_WRITEOK) {
1584			if (p->swap_file->f_mapping == mapping)
1585				break;
1586		}
1587		prev = type;
1588	}
1589	if (type < 0) {
1590		err = -EINVAL;
1591		spin_unlock(&swap_lock);
1592		goto out_dput;
1593	}
1594	if (!security_vm_enough_memory(p->pages))
1595		vm_unacct_memory(p->pages);
1596	else {
1597		err = -ENOMEM;
1598		spin_unlock(&swap_lock);
1599		goto out_dput;
1600	}
1601	if (prev < 0)
1602		swap_list.head = p->next;
1603	else
1604		swap_info[prev]->next = p->next;
1605	if (type == swap_list.next) {
1606		/* just pick something that's safe... */
1607		swap_list.next = swap_list.head;
1608	}
1609	if (p->prio < 0) {
1610		for (i = p->next; i >= 0; i = swap_info[i]->next)
1611			swap_info[i]->prio = p->prio--;
1612		least_priority++;
1613	}
1614	nr_swap_pages -= p->pages;
1615	total_swap_pages -= p->pages;
1616	p->flags &= ~SWP_WRITEOK;
1617	spin_unlock(&swap_lock);
1618
1619	current->flags |= PF_OOM_ORIGIN;
1620	err = try_to_unuse(type);
1621	current->flags &= ~PF_OOM_ORIGIN;
1622
1623	if (err) {
1624		/* re-insert swap space back into swap_list */
1625		spin_lock(&swap_lock);
1626		if (p->prio < 0)
1627			p->prio = --least_priority;
1628		prev = -1;
1629		for (i = swap_list.head; i >= 0; i = swap_info[i]->next) {
1630			if (p->prio >= swap_info[i]->prio)
1631				break;
1632			prev = i;
1633		}
1634		p->next = i;
1635		if (prev < 0)
1636			swap_list.head = swap_list.next = type;
1637		else
1638			swap_info[prev]->next = type;
1639		nr_swap_pages += p->pages;
1640		total_swap_pages += p->pages;
1641		p->flags |= SWP_WRITEOK;
1642		spin_unlock(&swap_lock);
1643		goto out_dput;
1644	}
1645
1646	/* wait for any unplug function to finish */
1647	down_write(&swap_unplug_sem);
1648	up_write(&swap_unplug_sem);
1649
1650	destroy_swap_extents(p);
1651	if (p->flags & SWP_CONTINUED)
1652		free_swap_count_continuations(p);
1653
1654	mutex_lock(&swapon_mutex);
1655	spin_lock(&swap_lock);
1656	drain_mmlist();
1657
1658	/* wait for anyone still in scan_swap_map */
1659	p->highest_bit = 0;		/* cuts scans short */
1660	while (p->flags >= SWP_SCANNING) {
1661		spin_unlock(&swap_lock);
1662		schedule_timeout_uninterruptible(1);
1663		spin_lock(&swap_lock);
1664	}
1665
1666	swap_file = p->swap_file;
1667	p->swap_file = NULL;
1668	p->max = 0;
1669	swap_map = p->swap_map;
1670	p->swap_map = NULL;
1671	p->flags = 0;
1672	spin_unlock(&swap_lock);
1673	mutex_unlock(&swapon_mutex);
1674	vfree(swap_map);
1675	/* Destroy swap account information */
1676	swap_cgroup_swapoff(type);
1677
1678	inode = mapping->host;
1679	if (S_ISBLK(inode->i_mode)) {
1680		struct block_device *bdev = I_BDEV(inode);
1681		set_blocksize(bdev, p->old_block_size);
1682		blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
1683	} else {
1684		mutex_lock(&inode->i_mutex);
1685		inode->i_flags &= ~S_SWAPFILE;
1686		mutex_unlock(&inode->i_mutex);
1687	}
1688	filp_close(swap_file, NULL);
1689	err = 0;
1690	atomic_inc(&proc_poll_event);
1691	wake_up_interruptible(&proc_poll_wait);
1692
1693out_dput:
1694	filp_close(victim, NULL);
1695out:
1696	return err;
1697}
1698
1699#ifdef CONFIG_PROC_FS
1700struct proc_swaps {
1701	struct seq_file seq;
1702	int event;
1703};
1704
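/*
 * poll() on /proc/swaps additionally reports POLLERR | POLLPRI whenever a
 * swapon or swapoff has bumped proc_poll_event since it was last checked.
 */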
1705static unsigned swaps_poll(struct file *file, poll_table *wait)
1706{
1707	struct proc_swaps *s = file->private_data;
1708
1709	poll_wait(file, &proc_poll_wait, wait);
1710
1711	if (s->event != atomic_read(&proc_poll_event)) {
1712		s->event = atomic_read(&proc_poll_event);
1713		return POLLIN | POLLRDNORM | POLLERR | POLLPRI;
1714	}
1715
1716	return POLLIN | POLLRDNORM;
1717}
1718
1719/* iterator */
1720static void *swap_start(struct seq_file *swap, loff_t *pos)
1721{
1722	struct swap_info_struct *si;
1723	int type;
1724	loff_t l = *pos;
1725
1726	mutex_lock(&swapon_mutex);
1727
1728	if (!l)
1729		return SEQ_START_TOKEN;
1730
1731	for (type = 0; type < nr_swapfiles; type++) {
1732		smp_rmb();	/* read nr_swapfiles before swap_info[type] */
1733		si = swap_info[type];
1734		if (!(si->flags & SWP_USED) || !si->swap_map)
1735			continue;
1736		if (!--l)
1737			return si;
1738	}
1739
1740	return NULL;
1741}
1742
1743static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
1744{
1745	struct swap_info_struct *si = v;
1746	int type;
1747
1748	if (v == SEQ_START_TOKEN)
1749		type = 0;
1750	else
1751		type = si->type + 1;
1752
1753	for (; type < nr_swapfiles; type++) {
1754		smp_rmb();	/* read nr_swapfiles before swap_info[type] */
1755		si = swap_info[type];
1756		if (!(si->flags & SWP_USED) || !si->swap_map)
1757			continue;
1758		++*pos;
1759		return si;
1760	}
1761
1762	return NULL;
1763}
1764
1765static void swap_stop(struct seq_file *swap, void *v)
1766{
1767	mutex_unlock(&swapon_mutex);
1768}
1769
1770static int swap_show(struct seq_file *swap, void *v)
1771{
1772	struct swap_info_struct *si = v;
1773	struct file *file;
1774	int len;
1775
1776	if (si == SEQ_START_TOKEN) {
1777		seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
1778		return 0;
1779	}
1780
1781	file = si->swap_file;
1782	len = seq_path(swap, &file->f_path, " \t\n\\");
1783	seq_printf(swap, "%*s%s\t%u\t%u\t%d\n",
1784			len < 40 ? 40 - len : 1, " ",
1785			S_ISBLK(file->f_path.dentry->d_inode->i_mode) ?
1786				"partition" : "file\t",
1787			si->pages << (PAGE_SHIFT - 10),
1788			si->inuse_pages << (PAGE_SHIFT - 10),
1789			si->prio);
1790	return 0;
1791}
1792
1793static const struct seq_operations swaps_op = {
1794	.start =	swap_start,
1795	.next =		swap_next,
1796	.stop =		swap_stop,
1797	.show =		swap_show
1798};
1799
1800static int swaps_open(struct inode *inode, struct file *file)
1801{
1802	struct proc_swaps *s;
1803	int ret;
1804
1805	s = kmalloc(sizeof(struct proc_swaps), GFP_KERNEL);
1806	if (!s)
1807		return -ENOMEM;
1808
1809	file->private_data = s;
1810
1811	ret = seq_open(file, &swaps_op);
1812	if (ret) {
1813		kfree(s);
1814		return ret;
1815	}
1816
1817	s->seq.private = s;
1818	s->event = atomic_read(&proc_poll_event);
1819	return ret;
1820}
1821
1822static const struct file_operations proc_swaps_operations = {
1823	.open		= swaps_open,
1824	.read		= seq_read,
1825	.llseek		= seq_lseek,
1826	.release	= seq_release,
1827	.poll		= swaps_poll,
1828};
1829
1830static int __init procswaps_init(void)
1831{
1832	proc_create("swaps", 0, NULL, &proc_swaps_operations);
1833	return 0;
1834}
1835__initcall(procswaps_init);
1836#endif /* CONFIG_PROC_FS */
1837
1838#ifdef MAX_SWAPFILES_CHECK
1839static int __init max_swapfiles_check(void)
1840{
1841	MAX_SWAPFILES_CHECK();
1842	return 0;
1843}
1844late_initcall(max_swapfiles_check);
1845#endif
1846
1847/*
1848 * Written 01/25/92 by Simmule Turner, heavily changed by Linus.
1849 *
1850 * The swapon system call
1851 */
1852SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
1853{
1854	struct swap_info_struct *p;
1855	char *name = NULL;
1856	struct block_device *bdev = NULL;
1857	struct file *swap_file = NULL;
1858	struct address_space *mapping;
1859	unsigned int type;
1860	int i, prev;
1861	int error;
1862	union swap_header *swap_header;
1863	unsigned int nr_good_pages;
1864	int nr_extents = 0;
1865	sector_t span;
1866	unsigned long maxpages;
1867	unsigned long swapfilepages;
1868	unsigned char *swap_map = NULL;
1869	struct page *page = NULL;
1870	struct inode *inode = NULL;
1871	int did_down = 0;
1872
1873	if (!capable(CAP_SYS_ADMIN))
1874		return -EPERM;
1875
1876	p = kzalloc(sizeof(*p), GFP_KERNEL);
1877	if (!p)
1878		return -ENOMEM;
1879
1880	spin_lock(&swap_lock);
1881	for (type = 0; type < nr_swapfiles; type++) {
1882		if (!(swap_info[type]->flags & SWP_USED))
1883			break;
1884	}
1885	error = -EPERM;
1886	if (type >= MAX_SWAPFILES) {
1887		spin_unlock(&swap_lock);
1888		kfree(p);
1889		goto out;
1890	}
1891	if (type >= nr_swapfiles) {
1892		p->type = type;
1893		swap_info[type] = p;
1894		/*
1895		 * Write swap_info[type] before nr_swapfiles, in case a
1896		 * racing procfs swap_start() or swap_next() is reading them.
1897		 * (We never shrink nr_swapfiles, we never free this entry.)
1898		 */
1899		smp_wmb();
1900		nr_swapfiles++;
1901	} else {
1902		kfree(p);
1903		p = swap_info[type];
1904		/*
1905		 * Do not memset this entry: a racing procfs swap_next()
1906		 * would be relying on p->type to remain valid.
1907		 */
1908	}
1909	INIT_LIST_HEAD(&p->first_swap_extent.list);
1910	p->flags = SWP_USED;
1911	p->next = -1;
1912	spin_unlock(&swap_lock);
1913
1914	name = getname(specialfile);
1915	error = PTR_ERR(name);
1916	if (IS_ERR(name)) {
1917		name = NULL;
1918		goto bad_swap_2;
1919	}
1920	swap_file = filp_open(name, O_RDWR|O_LARGEFILE, 0);
1921	error = PTR_ERR(swap_file);
1922	if (IS_ERR(swap_file)) {
1923		swap_file = NULL;
1924		goto bad_swap_2;
1925	}
1926
1927	p->swap_file = swap_file;
1928	mapping = swap_file->f_mapping;
1929	inode = mapping->host;
1930
1931	error = -EBUSY;
1932	for (i = 0; i < nr_swapfiles; i++) {
1933		struct swap_info_struct *q = swap_info[i];
1934
1935		if (i == type || !q->swap_file)
1936			continue;
1937		if (mapping == q->swap_file->f_mapping)
1938			goto bad_swap;
1939	}
1940
1941	error = -EINVAL;
1942	if (S_ISBLK(inode->i_mode)) {
1943		bdev = bdgrab(I_BDEV(inode));
1944		error = blkdev_get(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL,
1945				   sys_swapon);
1946		if (error < 0) {
1947			bdev = NULL;
1948			error = -EINVAL;
1949			goto bad_swap;
1950		}
1951		p->old_block_size = block_size(bdev);
1952		error = set_blocksize(bdev, PAGE_SIZE);
1953		if (error < 0)
1954			goto bad_swap;
1955		p->bdev = bdev;
1956		p->flags |= SWP_BLKDEV;
1957	} else if (S_ISREG(inode->i_mode)) {
1958		p->bdev = inode->i_sb->s_bdev;
1959		mutex_lock(&inode->i_mutex);
1960		did_down = 1;
1961		if (IS_SWAPFILE(inode)) {
1962			error = -EBUSY;
1963			goto bad_swap;
1964		}
1965	} else {
1966		goto bad_swap;
1967	}
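	/*
	 * At this point a block device has been claimed exclusively
	 * (FMODE_EXCL, with sys_swapon as the holder) and switched to
	 * PAGE_SIZE blocks, while a regular file has i_mutex held
	 * (did_down) so that the IS_SWAPFILE check above and the
	 * S_SWAPFILE update at the end are atomic against a concurrent
	 * swapon of the same file.
	 */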
1968
1969	swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
1970
1971	/*
1972	 * Read the swap header.
1973	 */
1974	if (!mapping->a_ops->readpage) {
1975		error = -EINVAL;
1976		goto bad_swap;
1977	}
1978	page = read_mapping_page(mapping, 0, swap_file);
1979	if (IS_ERR(page)) {
1980		error = PTR_ERR(page);
1981		goto bad_swap;
1982	}
1983	swap_header = kmap(page);
1984
1985	if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
1986		printk(KERN_ERR "Unable to find swap-space signature\n");
1987		error = -EINVAL;
1988		goto bad_swap;
1989	}
1990
1991	/* swap partition endianness hack... */
1992	if (swab32(swap_header->info.version) == 1) {
1993		swab32s(&swap_header->info.version);
1994		swab32s(&swap_header->info.last_page);
1995		swab32s(&swap_header->info.nr_badpages);
1996		for (i = 0; i < swap_header->info.nr_badpages; i++)
1997			swab32s(&swap_header->info.badpages[i]);
1998	}
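	/*
	 * The version field only reads as 1 once byte-swapped when the header
	 * was written on a machine of the opposite endianness; in that case
	 * the fields used below have just been converted in place.
	 */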
1999	/* Check the swap header's sub-version */
2000	if (swap_header->info.version != 1) {
2001		printk(KERN_WARNING
2002		       "Unable to handle swap header version %d\n",
2003		       swap_header->info.version);
2004		error = -EINVAL;
2005		goto bad_swap;
2006	}
2007
2008	p->lowest_bit  = 1;
2009	p->cluster_next = 1;
2010	p->cluster_nr = 0;
2011
2012	/*
2013	 * Find out how many pages are allowed for a single swap
2014	 * device. There are two limiting factors: 1) the number of
2015	 * bits for the swap offset in the swp_entry_t type and
2016	 * 2) the number of bits in a swap pte as defined by
2017	 * the different architectures. In order to find the
2018	 * largest possible bit mask a swap entry with swap type 0
2019	 * and swap offset ~0UL is created, encoded to a swap pte,
2020	 * decoded to a swp_entry_t again and finally the swap
2021	 * offset is extracted. This will mask all the bits from
2022	 * the initial ~0UL mask that can't be encoded in either
2023	 * the swp_entry_t or the architecture definition of a
2024	 * swap pte.
2025	 */
2026	maxpages = swp_offset(pte_to_swp_entry(
2027			swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
2028	if (maxpages > swap_header->info.last_page) {
2029		maxpages = swap_header->info.last_page + 1;
2030		/* p->max is an unsigned int: don't overflow it */
2031		if ((unsigned int)maxpages == 0)
2032			maxpages = UINT_MAX;
2033	}
2034	p->highest_bit = maxpages - 1;
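	/*
	 * Illustrative example (the numbers depend on the architecture's swap
	 * pte layout): if the pte leaves 24 bits for the swap offset, the
	 * round-trip above yields maxpages == 1 << 24; a 1GiB swap area with
	 * 4KiB pages (last_page == 262143) then clamps maxpages to 262144,
	 * and highest_bit becomes 262143.
	 */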
2035
2036	error = -EINVAL;
2037	if (!maxpages)
2038		goto bad_swap;
2039	if (swapfilepages && maxpages > swapfilepages) {
2040		printk(KERN_WARNING
2041		       "Swap area shorter than signature indicates\n");
2042		goto bad_swap;
2043	}
2044	if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
2045		goto bad_swap;
2046	if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
2047		goto bad_swap;
2048
2049	/* OK, set up the swap map and apply the bad block list */
2050	swap_map = vmalloc(maxpages);
2051	if (!swap_map) {
2052		error = -ENOMEM;
2053		goto bad_swap;
2054	}
2055
2056	memset(swap_map, 0, maxpages);
2057	nr_good_pages = maxpages - 1;	/* omit header page */
2058
2059	for (i = 0; i < swap_header->info.nr_badpages; i++) {
2060		unsigned int page_nr = swap_header->info.badpages[i];
2061		if (page_nr == 0 || page_nr > swap_header->info.last_page) {
2062			error = -EINVAL;
2063			goto bad_swap;
2064		}
2065		if (page_nr < maxpages) {
2066			swap_map[page_nr] = SWAP_MAP_BAD;
2067			nr_good_pages--;
2068		}
2069	}
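	/*
	 * swap_map[] now holds one byte per slot: 0 for a usable slot,
	 * SWAP_MAP_BAD for every page listed in the header's bad-block table.
	 * Slot 0 (the header page itself) is marked bad below, once we know
	 * the area has at least one good page.
	 */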
2070
2071	error = swap_cgroup_swapon(type, maxpages);
2072	if (error)
2073		goto bad_swap;
2074
2075	if (nr_good_pages) {
2076		swap_map[0] = SWAP_MAP_BAD;
2077		p->max = maxpages;
2078		p->pages = nr_good_pages;
2079		nr_extents = setup_swap_extents(p, &span);
2080		if (nr_extents < 0) {
2081			error = nr_extents;
2082			goto bad_swap;
2083		}
2084		nr_good_pages = p->pages;
2085	}
2086	if (!nr_good_pages) {
2087		printk(KERN_WARNING "Empty swap-file\n");
2088		error = -EINVAL;
2089		goto bad_swap;
2090	}
2091
2092	if (p->bdev) {
2093		if (blk_queue_nonrot(bdev_get_queue(p->bdev))) {
2094			p->flags |= SWP_SOLIDSTATE;
2095			p->cluster_next = 1 + (random32() % p->highest_bit);
2096		}
2097		if (discard_swap(p) == 0 && (swap_flags & SWAP_FLAG_DISCARD))
2098			p->flags |= SWP_DISCARDABLE;
2099	}
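	/*
	 * So on non-rotational media the next allocation starts at a random
	 * cluster rather than at the front of the area, and SWP_DISCARDABLE
	 * is only set if a discard of the area just succeeded and the caller
	 * asked for it with SWAP_FLAG_DISCARD.
	 */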
2100
2101	mutex_lock(&swapon_mutex);
2102	spin_lock(&swap_lock);
2103	if (swap_flags & SWAP_FLAG_PREFER)
2104		p->prio =
2105		  (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
2106	else
2107		p->prio = --least_priority;
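	/*
	 * SWAP_FLAG_PREFER means the caller supplied an explicit priority in
	 * the low bits of swap_flags (as swapon -p does); otherwise each new
	 * area gets the next more negative default priority, ranking it below
	 * every area enabled before it.
	 */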
2108	p->swap_map = swap_map;
2109	p->flags |= SWP_WRITEOK;
2110	nr_swap_pages += nr_good_pages;
2111	total_swap_pages += nr_good_pages;
2112
2113	printk(KERN_INFO "Adding %uk swap on %s.  "
2114			"Priority:%d extents:%d across:%lluk %s%s\n",
2115		nr_good_pages<<(PAGE_SHIFT-10), name, p->prio,
2116		nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
2117		(p->flags & SWP_SOLIDSTATE) ? "SS" : "",
2118		(p->flags & SWP_DISCARDABLE) ? "D" : "");
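	/*
	 * A typical resulting line (values purely illustrative):
	 *   Adding 1048572k swap on /dev/sdb1.  Priority:-1 extents:1 across:1048572k SS
	 */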
2119
2120	/* insert swap space into swap_list: */
2121	prev = -1;
2122	for (i = swap_list.head; i >= 0; i = swap_info[i]->next) {
2123		if (p->prio >= swap_info[i]->prio)
2124			break;
2125		prev = i;
2126	}
2127	p->next = i;
2128	if (prev < 0)
2129		swap_list.head = swap_list.next = type;
2130	else
2131		swap_info[prev]->next = type;
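	/*
	 * Example (hypothetical priorities): with existing areas at priority
	 * 5 and -1, a new area at priority 0 stops the walk at the -1 entry,
	 * giving the order 5 -> 0 -> -1; a new area with a priority equal to
	 * an existing one is inserted ahead of it.
	 */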
2132	spin_unlock(&swap_lock);
2133	mutex_unlock(&swapon_mutex);
2134	atomic_inc(&proc_poll_event);
2135	wake_up_interruptible(&proc_poll_wait);
2136
2137	error = 0;
2138	goto out;
2139bad_swap:
2140	if (bdev) {
2141		set_blocksize(bdev, p->old_block_size);
2142		blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
2143	}
2144	destroy_swap_extents(p);
2145	swap_cgroup_swapoff(type);
2146bad_swap_2:
2147	spin_lock(&swap_lock);
2148	p->swap_file = NULL;
2149	p->flags = 0;
2150	spin_unlock(&swap_lock);
2151	vfree(swap_map);
2152	if (swap_file) {
2153		if (did_down) {
2154			mutex_unlock(&inode->i_mutex);
2155			did_down = 0;
2156		}
2157		filp_close(swap_file, NULL);
2158	}
2159out:
2160	if (page && !IS_ERR(page)) {
2161		kunmap(page);
2162		page_cache_release(page);
2163	}
2164	if (name)
2165		putname(name);
2166	if (did_down) {
2167		if (!error)
2168			inode->i_flags |= S_SWAPFILE;
2169		mutex_unlock(&inode->i_mutex);
2170	}
2171	return error;
2172}
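/*
 * Userspace usage sketch (illustrative, not part of this file): after
 * mkswap(8) has written the SWAPSPACE2 header, glibc's swapon() wrapper in
 * <sys/swap.h> reaches this system call, e.g.
 *
 *	swapon("/dev/sdb1", SWAP_FLAG_PREFER |
 *			    ((5 << SWAP_FLAG_PRIO_SHIFT) & SWAP_FLAG_PRIO_MASK));
 *
 * enables the device at priority 5.
 */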
2173
2174void si_swapinfo(struct sysinfo *val)
2175{
2176	unsigned int type;
2177	unsigned long nr_to_be_unused = 0;
2178
2179	spin_lock(&swap_lock);
2180	for (type = 0; type < nr_swapfiles; type++) {
2181		struct swap_info_struct *si = swap_info[type];
2182
2183		if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
2184			nr_to_be_unused += si->inuse_pages;
2185	}
2186	val->freeswap = nr_swap_pages + nr_to_be_unused;
2187	val->totalswap = total_swap_pages + nr_to_be_unused;
2188	spin_unlock(&swap_lock);
2189}
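/*
 * These totals (in pages) are what sysinfo(2) and /proc/meminfo's
 * SwapTotal/SwapFree report; an area in the middle of swapoff (SWP_USED set
 * but SWP_WRITEOK clear) has its in-use pages added back to both figures.
 */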
2190
2191/*
2192 * Verify that a swap entry is valid and increment its swap map count.
2193 *
2194 * Returns one of the following:
2195 * - success -> 0
2196 * - swp_entry is invalid -> -EINVAL
2197 * - swp_entry is a migration entry -> -EINVAL
2198 * - swap-cache reference is requested but there is already one -> -EEXIST
2199 * - swap-cache reference is requested but the entry is not used -> -ENOENT
2200 * - swap-mapped reference requested but needs continued swap count -> -ENOMEM
2201 */
2202static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
2203{
2204	struct swap_info_struct *p;
2205	unsigned long offset, type;
2206	unsigned char count;
2207	unsigned char has_cache;
2208	int err = -EINVAL;
2209
2210	if (non_swap_entry(entry))
2211		goto out;
2212
2213	type = swp_type(entry);
2214	if (type >= nr_swapfiles)
2215		goto bad_file;
2216	p = swap_info[type];
2217	offset = swp_offset(entry);
2218
2219	spin_lock(&swap_lock);
2220	if (unlikely(offset >= p->max))
2221		goto unlock_out;
2222
2223	count = p->swap_map[offset];
2224	has_cache = count & SWAP_HAS_CACHE;
2225	count &= ~SWAP_HAS_CACHE;
2226	err = 0;
2227
2228	if (usage == SWAP_HAS_CACHE) {
2229
2230		/* set SWAP_HAS_CACHE if there is no cache and entry is used */
2231		if (!has_cache && count)
2232			has_cache = SWAP_HAS_CACHE;
2233		else if (has_cache)		/* someone else added cache */
2234			err = -EEXIST;
2235		else				/* no users remaining */
2236			err = -ENOENT;
2237
2238	} else if (count || has_cache) {
2239
2240		if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
2241			count += usage;
2242		else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX)
2243			err = -EINVAL;
2244		else if (swap_count_continued(p, offset, count))
2245			count = COUNT_CONTINUED;
2246		else
2247			err = -ENOMEM;
2248	} else
2249		err = -ENOENT;			/* unused swap entry */
2250
2251	p->swap_map[offset] = count | has_cache;
2252
2253unlock_out:
2254	spin_unlock(&swap_lock);
2255out:
2256	return err;
2257
2258bad_file:
2259	printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val);
2260	goto out;
2261}
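/*
 * As the function above shows, each swap_map byte carries the map count in
 * its low bits, with SWAP_HAS_CACHE set while a swapcache page exists for the
 * entry and COUNT_CONTINUED set once the count has overflowed into a
 * continuation page (see add_swap_count_continuation() below).
 */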
2262
2263/*
2264 * Help swapoff by noting that swap entry belongs to shmem/tmpfs
2265 * (in which case its reference count is never incremented).
2266 */
2267void swap_shmem_alloc(swp_entry_t entry)
2268{
2269	__swap_duplicate(entry, SWAP_MAP_SHMEM);
2270}
2271
2272/*
2273 * Increase reference count of swap entry by 1.
2274 * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required
2275 * but could not be atomically allocated.  Returns 0, just as if it succeeded,
2276 * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which
2277 * might occur if a page table entry has got corrupted.
2278 */
2279int swap_duplicate(swp_entry_t entry)
2280{
2281	int err = 0;
2282
2283	while (!err && __swap_duplicate(entry, 1) == -ENOMEM)
2284		err = add_swap_count_continuation(entry, GFP_ATOMIC);
2285	return err;
2286}
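/*
 * A caller holding a page table lock therefore follows this pattern (a
 * sketch, mirroring what the swap-pte copy path in mm/memory.c does):
 *
 *	if (swap_duplicate(entry) < 0) {
 *		... drop the page table lock, then ...
 *		add_swap_count_continuation(entry, GFP_KERNEL);
 *		... retake the lock and retry the duplicate ...
 *	}
 */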
2287
2288/*
2289 * @entry: swap entry for which we allocate swap cache.
2290 *
2291 * Called when allocating swap cache for an existing swap entry.
2292 * Returns 0 on success, or an error code from __swap_duplicate():
2293 * -EEXIST means the entry already has a swap cache.
2294 * Note: the return codes differ from those of swap_duplicate().
2295 */
2296int swapcache_prepare(swp_entry_t entry)
2297{
2298	return __swap_duplicate(entry, SWAP_HAS_CACHE);
2299}
2300
2301/*
2302 * swap_lock prevents the swap_map from being freed.  Don't grab an extra
2303 * reference on the swaphandle; it doesn't matter if it becomes unused.
2304 */
2305int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
2306{
2307	struct swap_info_struct *si;
2308	int our_page_cluster = page_cluster;
2309	pgoff_t target, toff;
2310	pgoff_t base, end;
2311	int nr_pages = 0;
2312
2313	if (!our_page_cluster)	/* no readahead */
2314		return 0;
2315
2316	si = swap_info[swp_type(entry)];
2317	target = swp_offset(entry);
2318	base = (target >> our_page_cluster) << our_page_cluster;
2319	end = base + (1 << our_page_cluster);
2320	if (!base)		/* first page is swap header */
2321		base++;
2322
2323	spin_lock(&swap_lock);
2324	if (end > si->max)	/* don't go beyond end of map */
2325		end = si->max;
2326
2327	/* Count contiguous allocated slots above our target */
2328	for (toff = target; ++toff < end; nr_pages++) {
2329		/* Don't read in free or bad pages */
2330		if (!si->swap_map[toff])
2331			break;
2332		if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
2333			break;
2334	}
2335	/* Count contiguous allocated slots below our target */
2336	for (toff = target; --toff >= base; nr_pages++) {
2337		/* Don't read in free or bad pages */
2338		if (!si->swap_map[toff])
2339			break;
2340		if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
2341			break;
2342	}
2343	spin_unlock(&swap_lock);
2344
2345	/*
2346	 * Indicate starting offset, and return number of pages to get:
2347	 * if only 1, say 0, since there's then no readahead to be done.
2348	 */
2349	*offset = ++toff;
2350	return nr_pages ? ++nr_pages : 0;
2351}
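/*
 * Worked example (values illustrative): with page_cluster == 3 the window
 * around target offset 37 is slots 32..39.  If slots 35..39 are allocated but
 * slot 34 is free, the upward scan counts 38 and 39, the downward scan counts
 * 36 and 35 and stops at 34, so *offset becomes 35 and the return value is 5:
 * read ahead slots 35..39, the target included.
 */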
2352
2353/*
2354 * add_swap_count_continuation - called when a swap count is duplicated
2355 * beyond SWAP_MAP_MAX: it allocates a new page and links that to the entry's
2356 * page of the original vmalloc'ed swap_map, to hold the continuation count
2357 * (for that entry and for its neighbouring PAGE_SIZE swap entries).  Called
2358 * again when count is duplicated beyond SWAP_MAP_MAX * SWAP_CONT_MAX, etc.
2359 *
2360 * These continuation pages are seldom referenced: the common paths all work
2361 * on the original swap_map, only referring to a continuation page when the
2362 * low "digit" of a count is incremented or decremented through SWAP_MAP_MAX.
2363 *
2364 * add_swap_count_continuation(, GFP_ATOMIC) can be called while holding
2365 * page table locks; if it fails, add_swap_count_continuation(, GFP_KERNEL)
2366 * can be called after dropping locks.
2367 */
2368int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
2369{
2370	struct swap_info_struct *si;
2371	struct page *head;
2372	struct page *page;
2373	struct page *list_page;
2374	pgoff_t offset;
2375	unsigned char count;
2376
2377	/*
2378	 * When debugging, it's easier to use __GFP_ZERO here; but it's better
2379	 * for latency not to zero a page while GFP_ATOMIC and holding locks.
2380	 */
2381	page = alloc_page(gfp_mask | __GFP_HIGHMEM);
2382
2383	si = swap_info_get(entry);
2384	if (!si) {
2385		/*
2386		 * An acceptable race has occurred since the failing
2387		 * __swap_duplicate(): the swap entry has been freed,
2388		 * perhaps even the whole swap_map cleared for swapoff.
2389		 */
2390		goto outer;
2391	}
2392
2393	offset = swp_offset(entry);
2394	count = si->swap_map[offset] & ~SWAP_HAS_CACHE;
2395
2396	if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) {
2397		/*
2398		 * The higher the swap count, the more likely it is that tasks
2399		 * will race to add swap count continuation: we need to avoid
2400		 * over-provisioning.
2401		 */
2402		goto out;
2403	}
2404
2405	if (!page) {
2406		spin_unlock(&swap_lock);
2407		return -ENOMEM;
2408	}
2409
2410	/*
2411	 * We are fortunate that although vmalloc_to_page uses pte_offset_map,
2412	 * no architecture is using highmem pages for kernel pagetables: so it
2413	 * will not corrupt the GFP_ATOMIC caller's atomic pagetable kmaps.
2414	 */
2415	head = vmalloc_to_page(si->swap_map + offset);
2416	offset &= ~PAGE_MASK;
2417
2418	/*
2419	 * Page allocation does not initialize the page's lru field,
2420	 * but it does always reset its private field.
2421	 */
2422	if (!page_private(head)) {
2423		BUG_ON(count & COUNT_CONTINUED);
2424		INIT_LIST_HEAD(&head->lru);
2425		set_page_private(head, SWP_CONTINUED);
2426		si->flags |= SWP_CONTINUED;
2427	}
2428
2429	list_for_each_entry(list_page, &head->lru, lru) {
2430		unsigned char *map;
2431
2432		/*
2433		 * If the previous map said no continuation, but we've found
2434		 * a continuation page, free our allocation and use this one.
2435		 */
2436		if (!(count & COUNT_CONTINUED))
2437			goto out;
2438
2439		map = kmap_atomic(list_page, KM_USER0) + offset;
2440		count = *map;
2441		kunmap_atomic(map, KM_USER0);
2442
2443		/*
2444		 * If this continuation count now has some space in it,
2445		 * free our allocation and use this one.
2446		 */
2447		if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX)
2448			goto out;
2449	}
2450
2451	list_add_tail(&page->lru, &head->lru);
2452	page = NULL;			/* now it's attached, don't free it */
2453out:
2454	spin_unlock(&swap_lock);
2455outer:
2456	if (page)
2457		__free_page(page);
2458	return 0;
2459}
2460
2461/*
2462 * swap_count_continued - when the original swap_map count is incremented
2463 * from SWAP_MAP_MAX, check if there is already a continuation page to carry
2464 * into, carry if so, or else fail until a new continuation page is allocated;
2465 * when the original swap_map count is decremented from 0 with continuation,
2466 * borrow from the continuation and report whether it still holds more.
2467 * Called while __swap_duplicate() or swap_entry_free() holds swap_lock.
2468 */
2469static bool swap_count_continued(struct swap_info_struct *si,
2470				 pgoff_t offset, unsigned char count)
2471{
2472	struct page *head;
2473	struct page *page;
2474	unsigned char *map;
2475
2476	head = vmalloc_to_page(si->swap_map + offset);
2477	if (page_private(head) != SWP_CONTINUED) {
2478		BUG_ON(count & COUNT_CONTINUED);
2479		return false;		/* need to add count continuation */
2480	}
2481
2482	offset &= ~PAGE_MASK;
2483	page = list_entry(head->lru.next, struct page, lru);
2484	map = kmap_atomic(page, KM_USER0) + offset;
2485
2486	if (count == SWAP_MAP_MAX)	/* initial increment from swap_map */
2487		goto init_map;		/* jump over SWAP_CONT_MAX checks */
2488
2489	if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */
2490		/*
2491		 * Think of how you add 1 to 999
2492		 */
2493		while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) {
2494			kunmap_atomic(map, KM_USER0);
2495			page = list_entry(page->lru.next, struct page, lru);
2496			BUG_ON(page == head);
2497			map = kmap_atomic(page, KM_USER0) + offset;
2498		}
2499		if (*map == SWAP_CONT_MAX) {
2500			kunmap_atomic(map, KM_USER0);
2501			page = list_entry(page->lru.next, struct page, lru);
2502			if (page == head)
2503				return false;	/* add count continuation */
2504			map = kmap_atomic(page, KM_USER0) + offset;
2505init_map:		*map = 0;		/* we didn't zero the page */
2506		}
2507		*map += 1;
2508		kunmap_atomic(map, KM_USER0);
2509		page = list_entry(page->lru.prev, struct page, lru);
2510		while (page != head) {
2511			map = kmap_atomic(page, KM_USER0) + offset;
2512			*map = COUNT_CONTINUED;
2513			kunmap_atomic(map, KM_USER0);
2514			page = list_entry(page->lru.prev, struct page, lru);
2515		}
2516		return true;			/* incremented */
2517
2518	} else {				/* decrementing */
2519		/*
2520		 * Think of how you subtract 1 from 1000
2521		 */
2522		BUG_ON(count != COUNT_CONTINUED);
2523		while (*map == COUNT_CONTINUED) {
2524			kunmap_atomic(map, KM_USER0);
2525			page = list_entry(page->lru.next, struct page, lru);
2526			BUG_ON(page == head);
2527			map = kmap_atomic(page, KM_USER0) + offset;
2528		}
2529		BUG_ON(*map == 0);
2530		*map -= 1;
2531		if (*map == 0)
2532			count = 0;
2533		kunmap_atomic(map, KM_USER0);
2534		page = list_entry(page->lru.prev, struct page, lru);
2535		while (page != head) {
2536			map = kmap_atomic(page, KM_USER0) + offset;
2537			*map = SWAP_CONT_MAX | count;
2538			count = COUNT_CONTINUED;
2539			kunmap_atomic(map, KM_USER0);
2540			page = list_entry(page->lru.prev, struct page, lru);
2541		}
2542		return count == COUNT_CONTINUED;
2543	}
2544}
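/*
 * In other words the swap_map byte is the low "digit" of the count and each
 * continuation page holds a higher digit for the same slot: incrementing past
 * SWAP_MAP_MAX carries one into the first continuation byte (999 + 1 -> 1000),
 * and decrementing from 0 with COUNT_CONTINUED set borrows back from it
 * (1000 - 1 -> 999).
 */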
2545
2546/*
2547 * free_swap_count_continuations - called by swapoff to free all continuation
2548 * pages appended to the swap_map, once the swap_map is quiesced, before vfree'ing it.
2549 */
2550static void free_swap_count_continuations(struct swap_info_struct *si)
2551{
2552	pgoff_t offset;
2553
2554	for (offset = 0; offset < si->max; offset += PAGE_SIZE) {
2555		struct page *head;
2556		head = vmalloc_to_page(si->swap_map + offset);
2557		if (page_private(head)) {
2558			struct list_head *this, *next;
2559			list_for_each_safe(this, next, &head->lru) {
2560				struct page *page;
2561				page = list_entry(this, struct page, lru);
2562				list_del(this);
2563				__free_page(page);
2564			}
2565		}
2566	}
2567}
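/*
 * The loop above steps offset by PAGE_SIZE so that vmalloc_to_page() lands on
 * each page of the vmalloc'ed swap_map exactly once; all continuation pages
 * for that page hang off its lru list, so a single pass frees them all.
 */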
2568